package com.bawei.foryk

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkStreamReview01 {

  def main(args: Array[String]): Unit = {
    val checkpointdir = "./checkdir2"
    // Recover the StreamingContext from the checkpoint directory if one
    // exists; otherwise build a fresh context with createFunc.
    val ssc = StreamingContext.getOrCreate(checkpointdir, () => createFunc(checkpointdir))
    // Start and block here, not inside createFunc: on recovery from a
    // checkpoint, createFunc is never called, so starting the context
    // there would leave a recovered context never started.
    ssc.start()
    ssc.awaitTermination()
  }

  def createFunc(checkpointdir: String): StreamingContext = {

    val conf: SparkConf = new SparkConf().setAppName("SparkStreamReview01").setMaster("local[2]")
    val sc = new SparkContext(conf)

    sc.setLogLevel("WARN")
    val ssc = new StreamingContext(sc, Seconds(5))
    // Checkpointing is required by both updateStateByKey and getOrCreate recovery
    ssc.checkpoint(checkpointdir)

    // Kafka consumer configuration
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.182.147:9092,192.168.182.148:9092,192.168.182.149:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "group1"
    )
    // 5. Define the topics to subscribe to; the set can hold more than one topic
    val topics = Set("test")
    // 6. Build the DStream with KafkaUtils.createDirectStream
    val kafkaTopicDS: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream(ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
    // Extract the message values from the Kafka records
    val lines: DStream[String] = kafkaTopicDS.map(_.value())

    val pairs: DStream[(String, Int)] = lines.flatMap(_.split(" ")).map((_, 1))

    // Alternative aggregations:
    //pairs.reduceByKey(_ + _).print()
    //pairs.reduceByKeyAndWindow((x: Int, y: Int) => x + y, Seconds(10), Seconds(5)).print()
    //pairs.countByValueAndWindow(Seconds(10), Seconds(5)).print()

    // Maintain a running count per key across batches
    val result: DStream[(String, Int)] = pairs.updateStateByKey((newValues: Seq[Int], state: Option[Int]) => {
      var total = state.getOrElse(0) // the accumulated count from previous batches
      for (value <- newValues) {
        total += value
      }
      Option(total)
    })
    result.print()

    ssc
  }

}
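A side note on the window operations commented out above: that form of reduceByKeyAndWindow recomputes the whole window on every slide. Spark's DStream API also has an overload that takes an inverse reduce function and updates the window incrementally; it requires checkpointing, which createFunc already enables. A minimal sketch that could replace the commented-out line inside createFunc (variable names match the code above):

// Incremental windowed count: each slide adds the new batch and subtracts
// the batch that left the window, instead of re-reducing the full
// 10-second window. Requires ssc.checkpoint(...), already set above.
val windowedCounts: DStream[(String, Int)] = pairs.reduceByKeyAndWindow(
  (a: Int, b: Int) => a + b, // fold a new batch into the window total
  (a: Int, b: Int) => a - b, // remove an expired batch from the total
  Seconds(10),               // window length
  Seconds(5)                 // slide interval
)
windowedCounts.print()

One caveat: with the inverse-function variant, keys whose count drops to zero remain in the window state; the longer overload that also takes a filterFunc can prune them.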
5.1 Window functions (window length, slide interval: reduceByKeyAndWindow, countByValueAndWindow)
5.2 Ways to obtain a StreamingContext object
5.3 updateStateByKey
5.4 Spark Streaming + Kafka
5.5 Output (writing to MySQL)
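Topic 5.5 (output to MySQL) is listed above but not demonstrated in the code. A minimal sketch of how result could be written out with plain JDBC, placed inside createFunc after result is computed; the JDBC URL, credentials, and the wordcount(word, cnt) table are assumptions, not from the original, and the MySQL JDBC driver would need to be on the classpath:

import java.sql.DriverManager

// Push each batch of (word, count) pairs to MySQL. The connection is
// opened inside foreachPartition so it is created on the executor and
// never serialized from the driver.
result.foreachRDD { rdd =>
  rdd.foreachPartition { partition =>
    val conn = DriverManager.getConnection(
      "jdbc:mysql://192.168.182.147:3306/spark", "root", "123456") // assumed URL and credentials
    val stmt = conn.prepareStatement(
      "REPLACE INTO wordcount (word, cnt) VALUES (?, ?)") // assumed table schema
    try {
      partition.foreach { case (word, count) =>
        stmt.setString(1, word)
        stmt.setInt(2, count)
        stmt.executeUpdate()
      }
    } finally {
      stmt.close()
      conn.close()
    }
  }
}

REPLACE INTO is MySQL-specific upsert syntax; it overwrites the row for a word each batch, which suits the running totals produced by updateStateByKey.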