zipOperator

package com.bjsxt.scala.spark.high.operator

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import scala.collection.mutable.ListBuffer

object zipOperator {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("ZipOperator")
      .set("spark.rdd.compress", "true")
    val sc = new SparkContext(conf)
    val rdd1 = sc.parallelize(1 to 10, 2) // 1 2 3 ... 10
    val rdd2 = sc.makeRDD(2 to 11, 2)     // 2 3 4 ... 11
    /**
     * zip requires that:
     * 1. both RDDs have the same number of partitions
     * 2. corresponding partitions contain the same number of elements
     */
    val rdd3 = rdd1.zip(rdd2)
    /**
     * (1,2)
     * (2,3)
     * ....
     * (10,11)
     */
    rdd3.foreach(println)
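
    // A small sanity check (our addition, not in the original post): zip
    // pairs elements positionally, so the result should match what Scala's
    // own collection zip produces for the same two ranges.
    assert(rdd3.collect().toSeq == (1 to 10).zip(2 to 11).toSeq)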
    /**
     * zipWithIndex turns the RDD into a KV RDD.
     * K: the element of the RDD; V: the 0-based index of that element in the RDD.
     * For 2-11:
     * (2,0)
     * (3,1)
     * ...
     */
    val zipWithIndexRdd = rdd2.zipWithIndex()
    zipWithIndexRdd.foreach(println)
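
    // Sanity check (our addition): zipWithIndex assigns global, 0-based
    // indices in partition order, mirroring Scala's collection zipWithIndex
    // (the RDD version produces Long indices, hence the toLong conversion).
    assert(zipWithIndexRdd.collect().toMap ==
      (2 to 11).zipWithIndex.map { case (v, i) => (v, i.toLong) }.toMap)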
    
    /**
     * zipWithUniqueId turns a non-KV RDD into a KV RDD.
     * K: the element of the RDD
     * V: a unique id computed as indexInPartition * numPartitions + partitionIndex,
     *    i.e. each partition starts at its own partition index and steps by
     *    the number of partitions.
     *
     * For rdd2 = 2-11 with 2 partitions:
     *   2-6  partition0: (2,0) (3,0+2) (4,0+4) ...
     *   7-11 partition1: (7,1) (8,1+2) (9,1+4) ...
     *
     * The mapPartitionsWithIndex call below only prints which partition each
     * element lives in; zipWithUniqueId itself follows after it.
     */
    rdd2.mapPartitionsWithIndex((index, iterator) => {
      // Print every element together with the id of the partition it lives in,
      // then pass the elements through unchanged.
      val list = new ListBuffer[Int]()
      while (iterator.hasNext) {
        val v = iterator.next()
        println("partition Id:" + index + "\tvalue:" + v)
        list += v
      }
      list.iterator
    }, preservesPartitioning = false).count() // count() just forces evaluation
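
    // An equivalent way to inspect the partition layout (our addition):
    // glom() turns each partition into an Array, so the driver can print
    // the contents of all partitions after a single collect().
    rdd2.glom().collect().zipWithIndex.foreach { case (part, idx) =>
      println(s"partition $idx -> ${part.mkString(", ")}")
    }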
    
    val zipWithUniqueIdRDD = rdd2.zipWithUniqueId()
    zipWithUniqueIdRDD.foreach(println)
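
    // Sanity check (our addition): with 2 partitions the k-th item of
    // partition p gets the id k * 2 + p, so partition 0 (values 2-6) gets
    // the even ids 0,2,4,6,8 and partition 1 (values 7-11) the odd ids
    // 1,3,5,7,9 -- assuming the usual even Range slicing of makeRDD.
    assert(zipWithUniqueIdRDD.collect().toMap ==
      Map(2 -> 0L, 3 -> 2L, 4 -> 4L, 5 -> 6L, 6 -> 8L,
          7 -> 1L, 8 -> 3L, 9 -> 5L, 10 -> 7L, 11 -> 9L))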
    // Spin so the driver stays alive and the Spark web UI (localhost:4040)
    // remains available for inspection; kill the process to stop.
    while (true) {}
  }
}

  
