package com.bjsxt.scala.spark.operator
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import scala.collection.mutable.ListBuffer
object CoalesceOperator {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("CoalesceOperator").setMaster("local")
    val sc = new SparkContext(sparkConf)
    val dataArr = Array("Angelababy1", "Angelababy2", "Angelababy3",
      "Angelababy4", "Angelababy5", "Angelababy6",
      "Angelababy7", "Angelababy8", "Angelababy9",
      "Angelababy10", "Angelababy11", "Angelababy12")
    val dataRdd = sc.parallelize(dataArr, 3)
    /**
     * Difference between mapPartitionsWithIndex and mapPartitions:
     * Same: both operate partition by partition.
     * Different: mapPartitionsWithIndex also receives the ID of each partition it processes.
     */
    dataRdd.mapPartitionsWithIndex((index, iterator) => {
      println("partitionId:" + index)
      // Buffer the elements while printing them: returning the original iterator after
      // draining it in the while loop would hand an empty iterator back to Spark.
      val result = new ListBuffer[String]()
      while (iterator.hasNext) {
        val value = iterator.next()
        println(value)
        result += value
      }
      result.iterator
    }, false).count()
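    // Side-effect-free alternative (sketch): tag each element with its partition ID and
    // collect the (partitionId, value) pairs back to the driver instead of printing
    // inside the task.
    // dataRdd.mapPartitionsWithIndex((index, iterator) => iterator.map(value => (index, value)))
    //   .collect()
    //   .foreach(println)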
    // The first argument is the number of partitions of the returned RDD;
    // the second argument controls whether the repartitioning goes through a shuffle.
    // val repartitionRDD = dataRdd.repartition(4)
    // val repartitionRDD = dataRdd.coalesce(4, false)
    // val repartitionRDD = dataRdd.coalesce(2, false)
    val repartitionRDD = dataRdd.coalesce(2, true)
    println("coalesceRDD.getNumPartitions:" + repartitionRDD.getNumPartitions)
    repartitionRDD.mapPartitionsWithIndex((index, values) => {
      println("partitionId after repartitioning:" + index)
      // Same pattern as above: buffer the elements so a non-empty iterator is returned.
      val result = new ListBuffer[String]()
      while (values.hasNext) {
        val value = values.next()
        println(value)
        result += value
      }
      result.iterator
    }).count()
    sc.stop()
  }
}