Spark Streaming integration with Kafka

Spark Streaming integration with Kafka 0.10 (Direct approach)

groupId = org.apache.spark
artifactId = spark-streaming-kafka-0-10_2.11
version = 2.1.1
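
If the project is built with sbt rather than Maven, the same coordinates can be declared as below (a minimal sketch, assuming a 2.11 Scala version to match the _2.11 artifact):

// sbt equivalent of the Maven coordinates above
libraryDependencies += "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.1.1"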

Main steps:

  KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))

  val topics = Array("pktest")

  val kafkaParams = Map(
      "bootstrap.servers" -> "server1:9092,server2:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "directConsumerWC",
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> "true"
  )
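
Besides Subscribe, the 0-10 module also provides ConsumerStrategies.SubscribePattern (subscribe to every topic matching a regex) and ConsumerStrategies.Assign (consume a fixed set of topic-partitions); LocationStrategies likewise offers PreferBrokers and PreferFixed in addition to PreferConsistent. A minimal sketch of the regex variant, assuming the same kafkaParams as above (the pattern "pktest.*" is only an illustrative example):

import java.util.regex.Pattern
import org.apache.spark.streaming.kafka010.ConsumerStrategies

// subscribe to every topic whose name matches the regex, using the kafkaParams defined above
val byPattern = ConsumerStrategies.SubscribePattern[String, String](Pattern.compile("pktest.*"), kafkaParams)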
Complete example:

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Duration, Seconds, StreamingContext}


object kafkaDirectWC2 {

    val topics = Array("pktest")

    val kafkaParams = Map(
        "bootstrap.servers" -> "hadoop102:9092,hadoop103:9092",
        "key.deserializer" -> classOf[StringDeserializer],
        "value.deserializer" -> classOf[StringDeserializer],
        "group.id" -> "directConsumerWC",
        "auto.offset.reset" -> "earliest",
        "enable.auto.commit" -> "true"
    )

    def main(args: Array[String]): Unit = {


        // ConsumerRecord is not Java-serializable, so the 0-10 integration needs the Kryo serializer
        val conf: SparkConf = new SparkConf().setAppName("kafkaDirectWC2")
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                .setMaster("local[*]")

        // 5-second batch interval
        val ssc: StreamingContext = new StreamingContext(conf, Seconds(5))


        val input: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
            ssc,
            LocationStrategies.PreferConsistent,
            ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
        )

        // word count over a 20-second window; the window length must be a multiple of the 5-second batch interval
        input.flatMap(_.value().split(" "))
            .map((_, 1))
            .reduceByKeyAndWindow(_ + _, Seconds(20))
            .foreachRDD(rdd => rdd.foreach(println))
        ssc.start()
        ssc.awaitTermination()
    }

}
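
With enable.auto.commit left at "true", the Kafka consumer commits offsets on its own timer, regardless of whether the batch was actually processed, so results can be lost or reprocessed after a failure. The 0-10 integration also exposes manual commits through CanCommitOffsets. A minimal sketch, assuming enable.auto.commit is set to "false" in kafkaParams and reusing the input stream from the example above:

import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, OffsetRange}

input.foreachRDD { rdd =>
    // the offset ranges this batch read from each Kafka partition
    val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

    // ... process and output the batch here ...

    // commit the offsets back to Kafka only after the output has succeeded
    input.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}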

 


 

Spark Streaming integration with Kafka 0.8 (Direct approach)

groupId = org.apache.spark
artifactId = spark-streaming-kafka-0-8_2.11
version = 2.1.1
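
Likewise, a minimal sbt sketch of these coordinates (assuming Scala 2.11):

libraryDependencies += "org.apache.spark" %% "spark-streaming-kafka-0-8" % "2.1.1"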

Main steps:

  KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)

  val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)      // brokers is passed in as a program argument at submit time

  val topicsSet: Set[String] = topics.split(",").toSet          // topics is passed in as a program argument at submit time

Complete example:

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object kafkaDirectWordCount {
    def main(args: Array[String]): Unit = {
        val conf: SparkConf = new SparkConf().setAppName("kafkaDirectWordCount").setMaster("local[2]")

        val ssc: StreamingContext = new StreamingContext(conf, Seconds(5))
        ssc.sparkContext.setLogLevel("ERROR")

        if (args.length != 2) {
            System.err.println("Usage: kafkaDirectWordCount <brokers> <topics>")
            System.exit(1)
        }

        // brokers and topics arrive as program arguments at submit time
        val Array(brokers, topics) = args
        val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
        val topicsSet: Set[String] = topics.split(",").toSet

        // the 0.8 Direct API yields (key, value) pairs rather than ConsumerRecords
        val messages: InputDStream[(String, String)] =
            KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)


        // word count on the message values
        messages.map(_._2).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()

        ssc.start()
        ssc.awaitTermination()
    }
}
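
The 0.8 Direct stream does not store offsets in ZooKeeper; it tracks them itself, and each batch exposes the exact offsets it consumed through HasOffsetRanges, which is useful when you want to log or persist offsets on your own. A minimal sketch, reusing the messages stream from the example above:

import org.apache.spark.streaming.kafka.{HasOffsetRanges, OffsetRange}

var offsetRanges = Array.empty[OffsetRange]

messages.transform { rdd =>
    // capture the offset ranges on the driver, before any shuffle breaks the RDD-to-Kafka-partition mapping
    offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    rdd
}.foreachRDD { _ =>
    for (o <- offsetRanges) {
        println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
    }
}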

  

posted @ 2019-12-08 21:41  手写伪代码