package com.bawei.foryk

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}


object SparkStreamReview01 {

  def main(args: Array[String]): Unit = {
    val checkpointdir = "./checkdir2"
    // Recover the context from the checkpoint directory if it exists,
    // otherwise build a fresh one with createFunc
    val ssc = StreamingContext.getOrCreate(checkpointdir, () => createFunc(checkpointdir))
    // Start (and block on) the context here, so a recovered context gets started too
    ssc.start()
    ssc.awaitTermination()
  }

  def createFunc(checkpointdir: String): StreamingContext = {

    val conf: SparkConf = new SparkConf().setAppName("SparkStreamReview01").setMaster("local[2]")
    val sc = new SparkContext(conf)

    sc.setLogLevel("WARN")
    val ssc = new StreamingContext(sc, Seconds(5)) // 5-second batch interval
    ssc.checkpoint(checkpointdir) // required by updateStateByKey and getOrCreate recovery


    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.182.147:9092,192.168.182.148:9092,192.168.182.149:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "group1"
    )
    // 5. Define the topics to subscribe to; the Set can hold multiple topics
    val topics = Set("test")
    // 6. Build a DStream with KafkaUtils.createDirectStream
    val kafkaTopicDS: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream(ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
    // Extract the message values from the Kafka records
    val lines: DStream[String] = kafkaTopicDS.map(_.value())

    val pairs: DStream[(String, Int)] = lines.flatMap(_.split(" ")).map((_, 1))

    // Per-batch and windowed alternatives (window length 10s, slide interval 5s):
    //pairs.reduceByKey(_ + _).print()
    //pairs.reduceByKeyAndWindow((x: Int, y: Int) => x + y, Seconds(10), Seconds(5)).print()
    //pairs.countByValueAndWindow(Seconds(10), Seconds(5)).print()

    // updateStateByKey maintains a running count per key across batches
    val result: DStream[(String, Int)] = pairs.updateStateByKey((values: Seq[Int], state: Option[Int]) => {
      val previous = state.getOrElse(0) // the accumulated count from earlier batches
      Option(previous + values.sum)     // add this batch's counts to it
    })
    result.print()

    // Return the fully configured context without starting it; main() starts it,
    // which also covers the case where the context was recovered from the checkpoint
    ssc
  }

}
5.1 Window functions (window length, slide interval: reduceByKeyAndWindow, countByValueAndWindow)
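A minimal sketch of the two windowed operators commented out in the code above, assuming the pairs: DStream[(String, Int)] stream from createFunc. With a 10-second window and a 5-second slide, each output covers the last two 5-second batches:

// Sum counts per word over the last 10 seconds, emitting every 5 seconds;
// window length and slide interval must both be multiples of the batch interval
val windowedCounts: DStream[(String, Int)] =
  pairs.reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(10), Seconds(5))

// Count occurrences of each distinct element over the same window
val windowedValues: DStream[((String, Int), Long)] =
  pairs.countByValueAndWindow(Seconds(10), Seconds(5))

windowedCounts.print()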

5.2 Ways to obtain a StreamingContext object
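A short sketch of the two routes, mirroring the code above (same app name, master, and batch interval as assumed there):

// Way 1: construct a new context directly (no recovery on restart);
// it can also be built from an existing SparkContext, as in createFunc
val conf = new SparkConf().setAppName("SparkStreamReview01").setMaster("local[2]")
val ssc1 = new StreamingContext(conf, Seconds(5))

// Way 2: recover from a checkpoint directory if one exists, otherwise create
// via the factory function; all DStream setup must happen inside that function
val ssc2 = StreamingContext.getOrCreate("./checkdir2", () => createFunc("./checkdir2"))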

5.3 updateStateByKey
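The code above uses updateStateByKey. A commonly used alternative with the same running word-count semantics is mapWithState, sketched here against the same pairs stream (unlike updateStateByKey, it only emits keys that were updated in the current batch, and it also requires checkpointing):

import org.apache.spark.streaming.{State, StateSpec}

// Fold this batch's value into the stored running count for the key
val mappingFunc = (word: String, one: Option[Int], state: State[Int]) => {
  val sum = one.getOrElse(0) + state.getOption.getOrElse(0)
  state.update(sum)
  (word, sum)
}

val stateCounts: DStream[(String, Int)] = pairs.mapWithState(StateSpec.function(mappingFunc))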

5.4 Spark Streaming + Kafka
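One detail the code above leaves out: if "enable.auto.commit" is set to false in kafkaParams, offsets can be committed back to Kafka manually after each batch is processed, using the kafka010 API on the kafkaTopicDS stream built above. A sketch:

import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}

kafkaTopicDS.foreachRDD { rdd =>
  // Capture the offset ranges this batch covers
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // ... process the batch here ...
  // Commit asynchronously once processing has succeeded (at-least-once semantics)
  kafkaTopicDS.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}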

5.5 Output (writing results to MySQL)
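This is not shown in the code above; a minimal sketch of the standard pattern, using foreachRDD + foreachPartition so that one JDBC connection is opened per partition on the executors rather than per record. The JDBC URL, credentials, and the wordcount(word, cnt) table are assumptions for illustration:

import java.sql.DriverManager

result.foreachRDD { rdd =>
  rdd.foreachPartition { partition =>
    // Create the connection on the executor; JDBC connections are not serializable
    val conn = DriverManager.getConnection(
      "jdbc:mysql://192.168.182.147:3306/test", "root", "123456") // assumed URL/credentials
    val stmt = conn.prepareStatement(
      "INSERT INTO wordcount(word, cnt) VALUES (?, ?) ON DUPLICATE KEY UPDATE cnt = ?") // assumed table
    partition.foreach { case (word, count) =>
      stmt.setString(1, word)
      stmt.setInt(2, count)
      stmt.setInt(3, count)
      stmt.executeUpdate()
    }
    stmt.close()
    conn.close()
  }
}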