手把手教你创建广播变量(broadcast)
1. 使用说明
/* * TODO 使用说明 * 在大表关联小表时,可以先将小表读取到 Driver 端内存,再作为只读的广播变量分发到各个 Executor 端读取, * 这样可以避免 shuffle 操作,大大提高 join 效率 * * */
2.广播变量实现 join
/**
 * Implements a join with a broadcast variable and compares it with the `join` operator.
 *
 * The small (id -> age) table is broadcast as a read-only map; every (id, name) record
 * of the larger RDD looks its id up in that map, so this variant joins without a shuffle.
 * The same result is then computed with `RDD.join`, and both results are printed.
 *
 * The logic lives in an explicit `main` instead of the body of `scala.App`: the Spark
 * documentation notes that subclasses of `scala.App` may not work correctly, because
 * the values become delayed-initialised fields that task closures then refer to.
 */
object Broadcast {

  /**
   * Runs the demo: builds both join results, prints them, then keeps the driver alive.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sc: SparkContext = CommonUtils.getSparkContext("Broadcase")

    // Small table: (id, age). Never reassigned, so a `val` is sufficient.
    val list: List[(Int, Int)] = List((1, 21), (2, 23), (3, 30), (4, 35))

    // Large table: (id, name). Id 5 has no match in the small table.
    val rdd: RDD[(Int, String)] =
      sc.parallelize(List((1, "张飞"), (2, "赵云"), (3, "关羽"), (4, "奉先"), (5, "子路")))

    // Declare the broadcast variable: the small table as an id -> age map.
    val bc: Broadcast[Map[Int, Int]] = sc.broadcast(list.toMap)

    // Join via the broadcast variable. The lookup yields an Option, so ids without a
    // match are dropped by flatMap; no null sentinel is needed and the element type
    // stays (Int, (String, Int)) instead of widening to Any.
    val broadcastJoined: RDD[(Int, (String, Int))] = rdd.flatMap { case (id, name) =>
      bc.value.get(id).map(age => (id, (name, age)))
    }

    // Join via the join operator, for comparison.
    val smallRdd: RDD[(Int, Int)] = sc.parallelize(list)
    val operatorJoined: RDD[(Int, (String, Int))] = rdd.join(smallRdd)

    println(broadcastJoined.collect().mkString(","))
    println(operatorJoined.collect().mkString(","))
    //(1,(张飞,21)),(2,(赵云,23)),(3,(关羽,30)),(4,(奉先,35))
    //(1,(张飞,21)),(2,(赵云,23)),(3,(关羽,30)),(4,(奉先,35))

    // Keep the driver running after the jobs finish, as the original endless loop did
    // (presumably so the Spark web UI stays reachable; confirm). Sleeping blocks
    // without spinning a CPU core, and the context is stopped if the wait ever ends.
    try Thread.sleep(Long.MaxValue)
    finally sc.stop()
  }
}