spark广播变量(demo1,未用广播变量)

分布式rdd跟本地driver关联代码

煎熬的是python采用的是list,scala tuple3,真的痛苦(加油,五年后写scala)

这种方法下,driver 端的本地 list 会随每个 task 的闭包被序列化分发,每个分区(task)都会收到一份副本;若改用广播变量,则每个 executor 只需保留一份

package com.matthew.bigdata.spark.core

import org.apache.spark.sql.SparkSession

object Demo14 {
  /**
   * Demo of joining a distributed RDD against a driver-local lookup list
   * WITHOUT a broadcast variable: the local data is captured in the map
   * closure and serialized with every task that references it.
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("demo14").master("local").getOrCreate()
    val sc = spark.sparkContext

    // Driver-local student info: (id, name, age).
    val stu_info_list = List((1, "张大仙", 11), (2, "王晓霞", 13), (3, "张甜甜", 11), (4, "王大力", 11))
    // Build an id -> name map once, so each record does an O(1) lookup
    // instead of rescanning the whole list (ids are unique here, so this
    // matches the original last-match-wins loop exactly).
    val nameById: Map[Int, String] = stu_info_list.map(s => s._1 -> s._2).toMap

    // Score records: (studentId, subject, score).
    val score_info_rdd = sc.parallelize(List((1, "语文", 99), (2, "数学", 99), (3, "英语", 99), (4, "编程", 99), (1, "语文", 99), (2, "编程", 88),
      (3, "语文", 77), (4, "英语", 73), (1, "语文", 77), (3, "英语", 66), (2, "编程", 99)))

    // Replace the student id with the student name; unknown ids map to ""
    // (same fallback as the original's un-assigned `var name = ""`).
    def map_func(data: (Int, String, Int)): (String, String, Int) =
      (nameById.getOrElse(data._1, ""), data._2, data._3)

    score_info_rdd.map(map_func).collect().foreach(println)

    // Release Spark resources; the original leaked the session.
    spark.stop()
  }
}
posted @ 2022-07-21 15:01  孤独的执行者  阅读(49)  评论(0)    收藏  举报