Programming Model: the Data Processing Layer
Basic APIs
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
/**
 * WordCount example: Spark Streaming consumes real-time data sent by a TCP server.
 *
 * 1. Start a Netcat server on the master node:
 * `$ nc -lk 9998` (if the nc command is not available, install it with yum install -y nc)
 *
 * 2. Run the Spark Streaming application on the cluster with the following command:
 spark-submit --class com.dev.streaming.NetworkWordCount \
 --master spark://master:7077 \
 --deploy-mode client \
 --driver-memory 512m \
 --executor-memory 512m \
 --total-executor-cores 4 \
 --executor-cores 2 \
 /home/hadoop-dev/spark-course/streaming/spark-streaming-basic-1.0-SNAPSHOT.jar
 or start an interactive session with:
 spark-shell --master spark://master:7077 --total-executor-cores 4 --executor-cores 2
 */
object BasicAPITest {
def main(args: Array[String]) {
val sparkConf = new SparkConf().setAppName("NetworkWordCount")
val sc = new SparkContext(sparkConf)
// StreamingContext is the programming entry point of Spark Streaming
val ssc = new StreamingContext(sc, Seconds(1))
//Data receiver (Receiver)
//Create a receiver (ReceiverInputDStream) that receives and processes the data sent over a socket from a port on one machine
val lines = ssc.socketTextStream("master", 9998, StorageLevel.MEMORY_AND_DISK_SER)
//Data processing (Process)
//The processing logic: a simple word count over the words that contain "exception"
val words = lines.flatMap(_.split(" ")).filter(_.contains("exception"))
val wordPairs = words.map(x => (x, 1))
// reduceByKey((a: Int, b: Int) => a + b, new HashPartitioner(10)) specifies the number of post-shuffle partitions and the partitioner (HashPartitioner by default)
val wordCounts = wordPairs.repartition(100).reduceByKey((a: Int, b: Int) => a + b, new HashPartitioner(10))
//Result output (Output)
//Print the results to the console
wordCounts.print()
//Start the streaming computation
ssc.start()
//Wait for the streaming application to terminate
ssc.awaitTermination()
}
}
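The submit command above assumes a standalone cluster. For a quick local test the same logic can be tried with a local master; the following is a minimal sketch, where the local[2] master, the localhost host name, and the object name LocalNetworkWordCount are assumptions that are not part of the original example:
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
object LocalNetworkWordCount {
  def main(args: Array[String]): Unit = {
    // local[2]: at least two threads, one for the socket receiver and one for batch processing
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("LocalNetworkWordCount")
    val sc = new SparkContext(sparkConf)
    val ssc = new StreamingContext(sc, Seconds(1))
    // read lines from a local Netcat server started with: nc -lk 9998
    val lines = ssc.socketTextStream("localhost", 9998, StorageLevel.MEMORY_AND_DISK_SER)
    lines.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _).print()
    ssc.start()
    ssc.awaitTermination()
  }
}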
Join APIs
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
* Created by tangweiqun on 2018/1/6.
*/
object JoinAPITest {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("NetworkWordCount")
val sc = new SparkContext(sparkConf)
// Create the context with a 5 second batch size
val ssc = new StreamingContext(sc, Seconds(5))
val lines1 = ssc.socketTextStream("master", 9998, StorageLevel.MEMORY_AND_DISK_SER)
val kvs1 = lines1.map { line =>
val arr = line.split(" ")
(arr(0), arr(1))
}
val lines2 = ssc.socketTextStream("master", 9997, StorageLevel.MEMORY_AND_DISK_SER)
val kvs2 = lines2.map { line =>
val arr = line.split(" ")
(arr(0), arr(1))
}
kvs1.join(kvs2).print()
kvs1.fullOuterJoin(kvs2).print()
kvs1.leftOuterJoin(kvs2).print()
kvs1.rightOuterJoin(kvs2).print()
//Start the streaming computation
ssc.start()
//Wait for the streaming application to terminate
ssc.awaitTermination()
}
}
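The four joins are applied batch by batch. As an illustration with made-up input lines: if, within the same 5-second batch, port 9998 receives the line `k1 a` and port 9997 receives `k1 b`, then join emits `(k1,(a,b))`; if only port 9998 receives `k2 c`, then leftOuterJoin emits `(k2,(c,None))`, rightOuterJoin drops it, and fullOuterJoin emits `(k2,(Some(c),None))`.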
Transform API
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by tangweiqun on 2018/1/6.
*/
object TransformAPITest {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("NetworkWordCount")
val sc = new SparkContext(sparkConf)
// Create the context with a 5 second batch size
val ssc = new StreamingContext(sc, Seconds(5))
val lines1 = ssc.socketTextStream("master", 9998, StorageLevel.MEMORY_AND_DISK_SER)
val kvs1 = lines1.map { line =>
val arr = line.split(" ")
(arr(0), arr(1))
}
/// Static data: a key-value RDD loaded from HDFS
val path = "hdfs://master:9999/user/hadoop-twq/spark-course/streaming/keyvalue.txt"
val keyvalueRDD =
sc.textFile(path).map { line =>
val arr = line.split(" ")
(arr(0), arr(1))
}
/// Join the real-time stream with the static RDD using transform
kvs1.transform { rdd =>
rdd.join(keyvalueRDD)
}.print()
val lines2 = ssc.socketTextStream("master", 9997, StorageLevel.MEMORY_AND_DISK_SER)
val kvs2 = lines2.map { line =>
val arr = line.split(" ")
(arr(0), arr(1))
}
//transformWith joins the corresponding batch RDDs of the two real-time streams
kvs1.transformWith(kvs2, (rdd1: RDD[(String, String)], rdd2: RDD[(String, String)]) => rdd1.join(rdd2)).print()
//Start the streaming computation
ssc.start()
//Wait for the streaming application to terminate
ssc.awaitTermination()
}
}
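The difference between the two APIs used above: transform lets you combine each batch RDD of a DStream with something outside the stream (here the static keyvalueRDD loaded from HDFS), while transformWith takes a second DStream and combines the corresponding batch RDDs of the two streams (here joining the two socket streams). Both hand you plain RDDs, so any RDD operation can be used inside them.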
Window API
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by tangweiqun on 2018/1/6.
*/
object WindowAPITest {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("NetworkWordCount")
val sc = new SparkContext(sparkConf)
// Create the context with a 1 second batch size
val ssc = new StreamingContext(sc, Seconds(1)) // the batch interval: it determines how the received data is divided into RDDs
val lines = ssc.socketTextStream("master", 9998, StorageLevel.MEMORY_AND_DISK_SER)
//Every 2 seconds, show the data received during the previous 20 seconds
val windowDStream = lines.window(Seconds(20), Seconds(2))
windowDStream.print()
//Start the streaming computation
ssc.start()
//Wait for the streaming application to terminate
ssc.awaitTermination()
}
}

batch interval - the interval at which the DStream produces RDDs, specified when creating the StreamingContext (set to 1s here); it determines how the received data is divided into RDDs
window length - the length of the window, i.e. how many of those RDDs one window covers (set to 20s here; must be a multiple of the batch interval)
sliding interval - the interval at which the window operation is executed (set to 2s here; must be a multiple of the batch interval)
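To make the relationship between the three parameters concrete, here is a minimal sketch (the object name CountByWindowSketch is an assumption; the host, port, and checkpoint path reuse the values from the examples above) that counts the lines received in the last 20 seconds, recomputed every 2 seconds:
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
object CountByWindowSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("CountByWindowSketch"))
    // batch interval = 1s: one RDD is generated from the received data every second
    val ssc = new StreamingContext(sc, Seconds(1))
    // countByWindow is implemented with an inverse reduce, so a checkpoint directory is required
    ssc.checkpoint("hdfs://master:9999/user/hadoop-twq/spark-course/streaming/checkpoint")
    val lines = ssc.socketTextStream("master", 9998, StorageLevel.MEMORY_AND_DISK_SER)
    // window length = 20s (20 batches), sliding interval = 2s (recomputed every 2 batches)
    lines.countByWindow(Seconds(20), Seconds(2)).print()
    ssc.start()
    ssc.awaitTermination()
  }
}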
ReduceByKeyAndWindow API
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by tangweiqun on 2018/1/6.
*/
object ReduceByKeyAndWindowAPITest {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("NetworkWordCount")
val sc = new SparkContext(sparkConf)
// Create the context with a 1 second batch size
val ssc = new StreamingContext(sc, Seconds(1))
ssc.checkpoint("hdfs://master:9999/user/hadoop-twq/spark-course/streaming/checkpoint")
val lines = ssc.socketTextStream("master", 9998, StorageLevel.MEMORY_AND_DISK_SER)
val words = lines.flatMap(_.split(" "))
//Every 5 seconds, count the number of occurrences of each word in the previous 20 seconds
val wordPair = words.map(x => (x, 1))
val wordCounts =
wordPair.reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(20), Seconds(5))
wordCounts.print()
//This overload takes a reduceFunc plus an invReduceFunc (inverse reduce function)
//It fits the case where the sliding interval is short and the window length is long
//It requires the checkpoint mechanism
val wordCountsOther =
wordPair.reduceByKeyAndWindow((a: Int, b: Int) => a + b,
(a: Int, b: Int) => a - b, Seconds(60), Seconds(2))
wordCountsOther.checkpoint(Seconds(12)) //5 to 10 times the sliding interval
wordCountsOther.print()
//A filter function can be passed to drop pairs whose value has fallen back to 0
val filteredWordCounts = words.map(x => (x, 1)).reduceByKeyAndWindow((a: Int, b: Int) => a + b,
(a: Int, b: Int) => a - b,
Seconds(30), Seconds(10), 4,
(record: (String, Int)) => record._2 != 0)
filteredWordCounts.print()
//Start the streaming computation
ssc.start()
//Wait for the streaming application to terminate
ssc.awaitTermination()
}
}

How the overload without an inverse reduce function computes a window:
1. reduceByKey is applied to each new batch RDD (e.g. rdd2 and rdd3) separately
2. the RDDs that fall into the window are unioned into a unionRDD
3. reduceByKey is applied again on the unionRDD
(no checkpoint mechanism is needed, since there is no dependency on previous window results)

How the overload with an inverse reduce function computes a window:
1. the RDDs of the previous window and of the current window are cogrouped
(this depends on the previous window's result, so the checkpoint mechanism is required)
2. invReduceFunc is applied to the values of the old RDDs (the batches that slid out of the window)
3. reduceFunc is applied to the values of the new RDDs (the batches that slid into the window)

localCheckpoint(): stores the data in memory and on local disk, but the data is not reliable
checkpoint(): stores the data in HDFS; the data is reliable and fault tolerance improves, but a checkpoint directory must be set
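A minimal Spark Core sketch of the two checkpoint calls mentioned above (the object name, the sample numbers, and the reuse of the HDFS checkpoint path from the example are assumptions):
import org.apache.spark.{SparkConf, SparkContext}
object CheckpointSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("CheckpointSketch"))
    // checkpoint() needs a reliable directory, typically on HDFS
    sc.setCheckpointDir("hdfs://master:9999/user/hadoop-twq/spark-course/streaming/checkpoint")
    val reliable = sc.parallelize(1 to 100).map(_ * 2)
    reliable.checkpoint()        // reliable: written to the checkpoint directory on HDFS
    reliable.count()             // the checkpoint is materialized when the RDD is first computed
    val unreliable = sc.parallelize(1 to 100).map(_ * 2)
    unreliable.localCheckpoint() // unreliable: kept only in the executors' memory/disk storage
    unreliable.count()
    sc.stop()
  }
}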
UpdateStateByKey API
updateStateByKey: this API computes and updates a new state for each key based on the key's previous state and the newly received data. Using it takes two steps: first, define an initial state for each key, and this state can be of any type; second, define a state update function that computes a key's new state from its previous state and the newly received values.
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import scala.collection.mutable.ListBuffer
object UpdateStateByKeyAPITest {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("NetworkWordCount")
val sc = new SparkContext(sparkConf)
// Create the context with a 1 second batch size
val ssc = new StreamingContext(sc, Seconds(1))
ssc.checkpoint("hdfs://master:9999/user/hadoop-twq/spark-course/streaming/checkpoint")
val lines = ssc.socketTextStream("master", 9998, StorageLevel.MEMORY_AND_DISK_SER)
val words = lines.flatMap(_.split(" "))
val wordsDStream = words.map(x => (x, 1))
/// values: Seq[Int] - the values collected for this key during the current batch interval
/// currentState: Option[Int] - the current state of this key
wordsDStream.updateStateByKey(
(values: Seq[Int], currentState: Option[Int]) => Some(currentState.getOrElse(0) + values.sum)).print()
//Another overload of updateStateByKey
/// The function receives an Iterator of triples: the key (String), the values received in this batch (Seq[Int]), and the key's current state (Option[Int])
wordsDStream.updateStateByKey[Int]((iter: Iterator[(String, Seq[Int], Option[Int])]) => {
val list = ListBuffer[(String, Int)]()
while (iter.hasNext) {
val (key, newCounts, currentState) = iter.next
val state = Some(currentState.getOrElse(0) + newCounts.sum)
val value = state.getOrElse(0)
if (key.contains("error")) {
list += ((key, value)) // only emit keys that contain "error"
}
}
list.toIterator
}, new HashPartitioner(4), true).print()
//Start the streaming computation
ssc.start()
//Wait for the streaming application to terminate
ssc.awaitTermination()
}
}
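The program above only exercises the second step (the update function). For the first step, updateStateByKey also has an overload that accepts an initial RDD of per-key states. A minimal sketch of that overload (the object name, the sample initial counts, and the reuse of the host, port, and checkpoint path from the surrounding examples are assumptions):
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
object UpdateStateWithInitialRDD {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("UpdateStateWithInitialRDD"))
    val ssc = new StreamingContext(sc, Seconds(1))
    // updateStateByKey keeps state across batches, so a checkpoint directory is mandatory
    ssc.checkpoint("hdfs://master:9999/user/hadoop-twq/spark-course/streaming/checkpoint")
    val words = ssc.socketTextStream("master", 9998, StorageLevel.MEMORY_AND_DISK_SER)
      .flatMap(_.split(" "))
      .map(x => (x, 1))
    // Step 1: the initial state for some keys (the words and counts are illustrative)
    val initialStateRDD = sc.parallelize(List(("hello", 10), ("world", 5)))
    // Step 2: the update function: previous state plus the sum of the values received in this batch
    val updateFunc: (Seq[Int], Option[Int]) => Option[Int] =
      (values, state) => Some(state.getOrElse(0) + values.sum)
    words.updateStateByKey[Int](updateFunc, new HashPartitioner(4), initialStateRDD).print()
    ssc.start()
    ssc.awaitTermination()
  }
}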
MapWithState API
mapWithState provides the same functionality as updateStateByKey, but with much better performance: it only computes a new state for keys that actually received new data, whereas updateStateByKey recomputes the state of every key in every batch. For this reason mapWithState can handle roughly ten times more keys than updateStateByKey and runs considerably faster. In addition, mapWithState supports a timeout API.
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming._
import org.apache.spark.{SparkConf, SparkContext}
object MapWithStateAPITest {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("NetworkWordCount")
val sc = new SparkContext(sparkConf)
// Create the context with a 5 second batch size
val ssc = new StreamingContext(sc, Seconds(5))
ssc.checkpoint("hdfs://master:9999/user/hadoop-twq/spark-course/streaming/checkpoint")
val lines = ssc.socketTextStream("master", 9998, StorageLevel.MEMORY_AND_DISK_SER)
val words = lines.flatMap(_.split(" "))
val wordsDStream = words.map(x => (x, 1))
val initialRDD = sc.parallelize(List(("dummy", 100L), ("source", 32L)))
// currentBatchTime: the time of the current batch
// key: the key whose state is to be updated
// value: the value for this key in the current batch
// currentState: the current state of this key
val stateSpec = StateSpec.function((currentBatchTime: Time, key: String, value: Option[Int], currentState: State[Long]) => {
val sum = value.getOrElse(0).toLong + currentState.getOption.getOrElse(0L)
val output = (key, sum)
if (!currentState.isTimingOut()) {
currentState.update(sum)
}
Some(output)
}).initialState(initialRDD).numPartitions(2).timeout(Seconds(30)) //timeout: if a key receives no data for this long, the key and its state are removed
val result = wordsDStream.mapWithState(stateSpec)
result.print()
// stateSnapshots shows the full state of every key accumulated so far, including the initial values
result.stateSnapshots().print()
//Start the streaming computation
ssc.start()
//Wait for the streaming application to terminate
ssc.awaitTermination()
}
}
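A note on the timeout in the example above, based on the documented StateSpec/State behavior: when a key receives no data for longer than the timeout, the mapping function is invoked one final time for that key with value = None and with state.isTimingOut() returning true; calling state.update() in that invocation would throw an exception, which is why the example guards the update with !currentState.isTimingOut(). After that final call the key's state is removed and it no longer appears in stateSnapshots().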
