Spark Streaming 缉查布控

package com.shujia.spark.streaming

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.{Durations, StreamingContext}

/**
  * Spark Streaming demo: blacklist filtering with a broadcast variable that
  * is rebuilt on every micro-batch.
  *
  * For each 5-second batch read from the Kafka topic `dianxin`, the job
  *   1. reloads the blacklist from the MySQL table `student.t_blacklist`,
  *   2. broadcasts it,
  *   3. keeps only the lines whose first comma-separated field is in the
  *      blacklist,
  *   4. prints the matching lines and appends them to the MySQL table
  *      `student.dianxin_black`,
  *   5. releases the per-batch broadcast variable.
  */
object Demo8BlackFilter {

  // JDBC connection settings shared by the blacklist read and the result write.
  // NOTE(review): credentials are hard-coded in source; consider moving them
  // to external configuration.
  private val JdbcUrl = "jdbc:mysql://master:3306"
  private val JdbcUser = "root"
  private val JdbcPassword = "123456"

  /**
    * Tells whether the first comma-separated field of `line` is blacklisted.
    *
    * `takeWhile` is used instead of `split(",")(0)` because `split` drops
    * trailing empty strings, so a line made only of commas yields an empty
    * array and indexing it throws.
    *
    * @param line      raw record value; may be null (e.g. a Kafka tombstone)
    * @param blackList blacklisted identifiers
    * @return true when the first field of a non-null line is in `blackList`
    */
  private def isBlacklisted(line: String, blackList: Set[String]): Boolean =
    Option(line).exists(l => blackList.contains(l.takeWhile(_ != ',')))

  /**
    * Entry point: builds the streaming pipeline and blocks until it terminates.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("black")
      .master("local[2]")
      .getOrCreate()
    import spark.implicits._

    val ssc = new StreamingContext(spark.sparkContext, Durations.seconds(5))

    val kafkaParams: Map[String, Object] = Map[String, Object](
      "bootstrap.servers" -> "master:9092,node1:9092,node2:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "asdasdasdas",
      "auto.offset.reset" -> "latest", // latest: only read newly arriving data
      "enable.auto.commit" -> "false"
    )

    // Topics to subscribe to.
    val topics = Array("dianxin")

    val linesDS: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        PreferConsistent,
        Subscribe[String, String](topics, kafkaParams)
      )

    // Driver side: executed exactly once, while the pipeline is being defined.
    println("foreachRDD外面")

    linesDS.foreachRDD { rdd =>
      // Driver side: executed once per batch.
      println("foreachRDD内部,算子外部")

      // Reload the blacklist on every batch so that changes made in MySQL
      // are picked up without restarting the job.
      val blackListDF: DataFrame = spark.read
        .format("jdbc")
        .option("url", JdbcUrl)
        .option("dbtable", "student.t_blacklist")
        .option("user", JdbcUser)
        .option("password", JdbcPassword)
        .load()

      // A Set gives hash lookups instead of a linear scan per record.
      val blackList: Set[String] = blackListDF.as[String].collect().toSet

      // Broadcast the blacklist for this batch.
      val blackListBroadcast: Broadcast[Set[String]] =
        spark.sparkContext.broadcast(blackList)

      // Extract the value first (plain Strings are cheap to cache), filter,
      // then cache: two actions consume this RDD below, and without caching
      // each of them would re-read the batch from Kafka.
      val blockedLines: RDD[String] = rdd
        .map(_.value())
        .filter(line => isBlacklisted(line, blackListBroadcast.value))
        .cache()

      try {
        // Printed by the tasks; with a local master that is this same JVM.
        blockedLines.foreach(println)

        // Append the matching lines to MySQL.
        blockedLines
          .toDF("line")
          .write
          .mode(SaveMode.Append)
          .format("jdbc")
          .option("url", JdbcUrl)
          .option("dbtable", "student.dianxin_black")
          .option("user", JdbcUser)
          .option("password", JdbcPassword)
          .save()
      } finally {
        // Release per-batch resources even when the write fails. The
        // broadcast is never reused after this batch, so destroy it rather
        // than only dropping the executor copies.
        blockedLines.unpersist()
        blackListBroadcast.destroy()
      }
    }

    ssc.start()
    // Stop the context even when awaitTermination rethrows a streaming error.
    try ssc.awaitTermination()
    finally ssc.stop()
  }
}

 

posted @ 2021-07-25 16:59  坤坤无敌  阅读(55)  评论(0)    收藏  举报