CoalesceOperator

package com.bjsxt.scala.spark.operator

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import scala.collection.mutable.ListBuffer

object CoalesceOperator {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("CoalesceOperator").setMaster("local")
    val sc = new SparkContext(sparkConf)
    val dataArr = Array("Angelababy1", "Angelababy2", "Angelababy3",
      "Angelababy4", "Angelababy5", "Angelababy6",
      "Angelababy7", "Angelababy8", "Angelababy9",
      "Angelababy10", "Angelababy11", "Angelababy12")
    // Build the source RDD with 3 partitions.
    val dataRdd = sc.parallelize(dataArr, 3)
    /**
     * Difference between mapPartitionsWithIndex and mapPartitions:
     *   Same:      both iterate over the RDD one partition at a time
     *   Different: mapPartitionsWithIndex also receives the ID of each partition it visits
     */
    dataRdd.mapPartitionsWithIndex((index, iterator) => {
      println("partitionId:" + index)
      // Buffer the elements while printing them: the input iterator can only be
      // consumed once, so we return the buffer's iterator instead of the spent one.
      val buffer = new ListBuffer[String]()
      while (iterator.hasNext) {
        val value = iterator.next()
        println(value)
        buffer += value
      }
      buffer.iterator
    }, false).count()
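    // For contrast, a minimal added sketch (not part of the original post):
    // mapPartitions also works one partition at a time, but its function only
    // receives the element iterator and has no access to the partition index.
    dataRdd.mapPartitions(iterator => {
      iterator.map(value => {
        println(value)
        value
      })
    }).count()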
    // First argument: the number of partitions of the returned RDD.
    // Second argument: whether the repartitioning should go through a shuffle.
    // val coalesceRdd = dataRdd.repartition(4)
    // val coalesceRdd = dataRdd.coalesce(4, false)
    // val coalesceRdd = dataRdd.coalesce(2, false)
    val coalesceRdd = dataRdd.coalesce(2, true)
    println("coalesceRdd.getNumPartitions:" + coalesceRdd.getNumPartitions)
    coalesceRdd.mapPartitionsWithIndex((index, iterator) => {
      println("partitionId after coalesce:" + index)
      val buffer = new ListBuffer[String]()
      while (iterator.hasNext) {
        val value = iterator.next()
        println(value)
        buffer += value
      }
      buffer.iterator
    }).count()

    sc.stop()
  }
}
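
The commented-out variants above turn on one rule: with shuffle set to false, coalesce can only merge existing partitions, so asking for more partitions than the RDD already has leaves the partition count unchanged, while repartition(n) is simply shorthand for coalesce(n, shuffle = true) and can therefore both grow and shrink the partition count. A minimal standalone sketch of that behaviour (the object name below is made up for illustration, not part of the original post):

package com.bjsxt.scala.spark.operator

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object CoalesceVsRepartition {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("CoalesceVsRepartition").setMaster("local"))
    val rdd = sc.parallelize(1 to 12, 3)

    // Shrinking without a shuffle: parent partitions are merged locally.
    println(rdd.coalesce(2, false).getNumPartitions)  // 2

    // Growing without a shuffle is not possible; Spark keeps the original 3 partitions.
    println(rdd.coalesce(4, false).getNumPartitions)  // 3

    // repartition(n) delegates to coalesce(n, shuffle = true), so it can grow to 4.
    println(rdd.repartition(4).getNumPartitions)      // 4

    sc.stop()
  }
}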

  
