Spark Streaming integration with Kafka
Spark Streaming + Kafka 0-10 integration (Direct approach)
groupId = org.apache.spark
artifactId = spark-streaming-kafka-0-10_2.11
version = 2.1.1
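If the project is built with sbt rather than Maven, the same dependency can be declared roughly as below (the %% operator appends the Scala binary version, matching the _2.11 artifactId above):
// sbt equivalent of the Maven coordinates above, assuming a Scala 2.11 build
libraryDependencies += "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.1.1"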
Main steps:
KafkaUtils.createDirectStream[String,String](ssc,LocationStrategies.PreferConsistent,ConsumerStrategies.Subscribe[String,String](topics, kafkaParams))
val topics = Array("pktest")
val kafkaParams = Map(
"bootstrap.servers" -> "server1:9092,server2:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "directConsumerWC",
"auto.offset.reset" -> "earliest",
"enable.auto.commit" -> "true"
)
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Duration, Seconds, StreamingContext}
object kafkaDirectWC2 {
val topics = Array("pktest")
val kafkaParams = Map(
"bootstrap.servers" -> "hadoop102:9092,hadoop103:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "directConsumerWC",
"auto.offset.reset" -> "earliest",
"enable.auto.commit" -> "true"
)
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setAppName("kafkaDirectWC2")
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") //010版本得设置序列化器
.setMaster("local[*]")
val ssc: StreamingContext = new StreamingContext(conf,Seconds(5))
val input: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String,String](topics, kafkaParams)
)
input.flatMap(_.value().split(" "))
.map((_, 1))
.reduceByKeyAndWindow(_ + _, Duration(20000)) // 20-second window (a multiple of the 5s batch interval), sliding every batch
.foreachRDD(rdd => rdd.foreach(println(_))) // println runs on the executors; in local mode the output still shows up in this console
ssc.start()
ssc.awaitTermination()
}
}
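With enable.auto.commit left at "true", as above, the Kafka consumer commits offsets on its own timer regardless of whether the batch output actually succeeded. The 0-10 integration also lets the application commit offsets back to Kafka itself once the output has finished. A minimal sketch, assuming "enable.auto.commit" is switched to "false" and reusing the input stream created above:
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}
// input is the InputDStream returned by KafkaUtils.createDirectStream above
input.foreachRDD { rdd =>
// offset ranges are only available on the RDDs produced directly by the Kafka stream
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
// ... write this batch's results to the sink here ...
// commit the offsets asynchronously only after the output has completed
input.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}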
Spark Streaming + Kafka 0-8 integration (Direct approach)
groupId = org.apache.spark
artifactId = spark-streaming-kafka-0-8_2.11
version = 2.1.1
Main steps:
KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)
val kafkaParams = Map[String,String]("metadata.broker.list" -> brokers) // brokers passed in as a submit-time argument
val topicsSet: Set[String] = topics.split(",").toSet // topics passed in as a submit-time argument
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
object kafkaDirectWordCount {
def main(args: Array[String]): Unit = {
// validate the command-line arguments before spinning up the StreamingContext
if (args.length != 2) {
System.err.println("Usage: kafkaDirectWordCount <brokers> <topics>")
System.exit(1)
}
val Array(brokers, topics) = args
val conf: SparkConf = new SparkConf().setAppName("kafkaDirectWordCount").setMaster("local[2]")
val ssc: StreamingContext = new StreamingContext(conf, Seconds(5))
ssc.sparkContext.setLogLevel("ERROR")
val kafkaParams = Map[String,String]("metadata.broker.list" -> brokers)
val topicsSet: Set[String] = topics.split(",").toSet
val messages: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)
messages.map(_._2).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print() // _._2 is the message value; word count per 5-second batch
ssc.start()
ssc.awaitTermination()
}
}
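The 0-8 direct stream does not commit offsets to ZooKeeper; if the application needs to know exactly which offsets each batch covered (for example, to store them in its own offset store), it can read them off each batch's RDD. A minimal sketch, reusing the messages stream from the example above:
import org.apache.spark.streaming.kafka.{HasOffsetRanges, OffsetRange}
// capture the offset ranges on the driver before any transformation that discards them
var offsetRanges = Array.empty[OffsetRange]
messages.transform { rdd =>
offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
rdd
}.foreachRDD { rdd =>
for (o <- offsetRanges) {
// topic, partition, and the first/last offsets consumed in this batch
println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
}
}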