package com.shujia.spark.streaming

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.{Durations, StreamingContext}

object Demo2StreamOnRDD {
  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession.builder()
      .appName("streaming")
      .master("local[2]")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
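    // Note: receiver-based sources (such as socketTextStream below) occupy one
    // thread, so local mode needs at least two ("local[2]"); with "local[1]"
    // there would be no thread left to process the received data.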

    import spark.implicits._
    import org.apache.spark.sql.functions._

    val sc: SparkContext = spark.sparkContext

    /**
     * Create the streaming context and set the batch interval, i.e. how often
     * a new micro-batch is computed (here: every 5 seconds).
     */
    val ssc = new StreamingContext(sc, Durations.seconds(5))

    val linesDS: ReceiverInputDStream[String] = ssc.socketTextStream("master", 8888)
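    // To feed test data, run `nc -lk 8888` on the "master" host and type
    // comma-separated words (e.g. "java,spark,java").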

    /**
     * foreachRDD: exposes each micro-batch as an RDD, so the regular RDD API
     * (and Spark SQL, after converting the RDD to a DataFrame) can be used.
     */
    linesDS.foreachRDD(rdd => {
      /**
       * This function runs once per batch, so it only sees the current batch's
       * data; it cannot do global (cross-batch) aggregation on its own
       * (see the stateful sketch after this block).
       */

      // Word count with the RDD API. RDD transformations are lazy, so nothing
      // runs until an action such as the foreach below is uncommented.
      rdd.flatMap(_.split(","))
        .map((_, 1))
        .reduceByKey(_ + _)
      //  .foreach(println)

      // The same word count with the DataFrame API; also lazy until an action
      // such as show() is called.
      val lineDF: DataFrame = rdd.toDF("lines")
      lineDF
        .select(explode(split($"lines", ",")) as "word")
        .groupBy($"word")
        .agg(count($"word") as "C")
      //  .show()

      // The same word count again with Spark SQL, via a temp view over the DataFrame.
      lineDF.createOrReplaceTempView("words")
      spark.sql(
        """
          |select word, count(1) as c from (
          |  select explode(split(lines, ',')) as word from words
          |) as a
          |group by word
        """.stripMargin)
        .show()
    })
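
    // Sketch (not in the original job): the batch-local counts above reset every
    // 5 seconds. A running, global word count would need a stateful operator such
    // as updateStateByKey, which in turn requires a checkpoint directory, e.g.:
    //
    //   ssc.checkpoint("checkpoint")  // hypothetical local path
    //   linesDS.flatMap(_.split(","))
    //     .map((_, 1))
    //     .updateStateByKey((values: Seq[Int], state: Option[Int]) =>
    //       Some(values.sum + state.getOrElse(0)))
    //     .print()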

    // Start the streaming job. All three calls below are required: start() launches
    // the job, awaitTermination() blocks the main thread until the job stops or
    // fails, and stop() then releases the resources.
    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
  }
}