手把手教你创建广播变量(broadcast)

1. 使用说明

/*
* TODO 使用说明
*     在大表 关联 小表时,可以将小表读取到本地内存(Driver),再作为一个只读变量分发到Executor端读取,
*     这样操作避免了shuffle操作,大大提高了join效率
*
* */

2.广播变量实现 join

/**
 * Demonstrates joining a large keyed RDD with a small lookup table in two ways:
 *
 *  1. a map-side join that ships the small table to the tasks as a broadcast
 *     variable and looks each key up locally, and
 *  2. the regular `join` operator on two pair RDDs, for comparison.
 *
 * Both results are collected and printed so they can be compared.
 *
 * The entry point is an explicit `main` rather than `extends App`: with `App`
 * the object's fields are initialised through `DelayedInit`, and the Spark
 * documentation warns that such applications may not work correctly. Keeping
 * everything as local values also means the closure below captures plain
 * locals instead of fields of this object.
 */
object Broadcast {

  def main(args: Array[String]): Unit = {
    // NOTE(review): the app name "Broadcase" looks like a typo for "Broadcast";
    // it is left unchanged because it is a runtime string.
    val sc: SparkContext = CommonUtils.getSparkContext("Broadcase")

    try {
      // Small side of the join: (id, age) pairs.
      val ages: List[(Int, Int)] = List((1, 21), (2, 23), (3, 30), (4, 35))

      // Large side of the join: (id, name) pairs. Id 5 has no matching age.
      val names: RDD[(Int, String)] =
        sc.parallelize(List((1, "张飞"), (2, "赵云"), (3, "关羽"), (4, "奉先"), (5, "子路")))

      // Declare the broadcast variable holding the small table as an id -> age map.
      val ageById: Broadcast[Map[Int, Int]] = sc.broadcast(ages.toMap)

      // Join via the broadcast variable: look each id up in the broadcast map and
      // drop records without a match (inner-join semantics). Using Option with
      // flatMap keeps the age typed as Int instead of falling back to null/Any.
      val joinedViaBroadcast: RDD[(Int, (String, Int))] = names.flatMap {
        case (id, name) => ageById.value.get(id).map(age => (id, (name, age)))
      }

      // Join via the built-in join operator.
      val agesRdd: RDD[(Int, Int)] = sc.parallelize(ages)
      val joinedViaOperator: RDD[(Int, (String, Int))] = names.join(agesRdd)

      println(joinedViaBroadcast.collect().mkString(","))
      println(joinedViaOperator.collect().mkString(","))
      // Example output recorded by the original author:
      //(1,(张飞,21)),(2,(赵云,23)),(3,(关羽,30)),(4,(奉先,35))
      //(1,(张飞,21)),(2,(赵云,23)),(3,(关羽,30)),(4,(奉先,35))

      // Keep the driver alive after the jobs finish (presumably so the Spark UI
      // can be inspected). Sleeping blocks the thread without the CPU cost of an
      // empty busy-wait loop; terminate the process manually when done.
      Thread.sleep(Long.MaxValue)
    } finally {
      // Release the context if the sleep is interrupted or a job fails.
      sc.stop()
    }
  }
}

 

posted @ 2022-06-20 19:40  学而不思则罔!  阅读(120)  评论(0)    收藏  举报