1. 说明
/*
* 定义 :
* def mapPartitionsWithIndex[U: ClassTag](
* f: (Int, Iterator[T]) => Iterator[U],
* preservesPartitioning: Boolean = false): RDD[U] = withScope
*
* 功能 :
 * 对 RDD 的分区进行转换:获取分区的迭代器和分区编号,返回处理后的迭代器
*
* */
object mapPartitionsWithIndexTest {
  /**
   * Demonstrates `mapPartitionsWithIndex`: the transform function receives
   * each partition's index together with an iterator over its elements and
   * returns a new iterator.
   *
   * Here every element is mapped to its absolute value; the partition index
   * is only used for logging.
   *
   * NOTE: an explicit `main` is used instead of `extends App` — the App
   * trait's delayed initialization turns top-level vals into fields, which
   * interacts badly with Spark closure serialization.
   */
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local").setAppName("mapPartitionsWithIndexTest")
    val sc: SparkContext = new SparkContext(sparkconf)
    try {
      // 7 elements over 2 partitions.
      val rdd: RDD[Int] = sc.makeRDD(List(1, -2, 3, 14, 1, -10, -100), 2)
      // Take the absolute value of every element; the println fires once per
      // partition, when the partition is actually computed.
      val absRdd: RDD[Int] = rdd.mapPartitionsWithIndex(
        (index, iter) => {
          println(s"正在处理 : ${index} 号分区")
          iter.map(_.abs)
        }
      )
      println(s"rdd1 :${absRdd.collect().mkString(",")}")
    } finally {
      // Always release the SparkContext, even if the job above fails.
      sc.stop()
    }
  }
}
// 处理结果
正在处理 : 0 号分区
正在处理 : 1 号分区
rdd1 :1,2,3,14,1,10,100
需求1 : 过滤 分区
object mapPartitionsWithIndexTestFilterPartion {
  /**
   * Requirement 1: keep only the elements of one partition.
   *
   * Uses `mapPartitionsWithIndex` to pass partition 0 through unchanged and
   * replace every other partition with an empty iterator.
   *
   * NOTE: an explicit `main` is used instead of `extends App` — the App
   * trait's delayed initialization turns top-level vals into fields, which
   * interacts badly with Spark closure serialization.
   */
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local").setAppName("mapPartitionsWithIndexTest")
    val sc: SparkContext = new SparkContext(sparkconf)
    try {
      // 7 elements over 2 partitions.
      val rdd: RDD[Int] = sc.makeRDD(List(1, -2, 3, 14, 1, -10, -100), 2)
      // Keep partition 0 only; `Iterator.empty` is the idiomatic empty
      // iterator (clearer than `Nil.iterator`).
      val absRdd: RDD[Int] = rdd.mapPartitionsWithIndex(
        (index, iter) => {
          if (index == 0) iter else Iterator.empty
        }
      )
      println(s"rdd1 :${absRdd.collect().mkString(",")}")
    } finally {
      // Always release the SparkContext, even if the job above fails.
      sc.stop()
    }
  }
}
需求2 : 为元素添加所在的分区编号 (分区编号,元素)
object mapPartitionsWithIndexTestAddPartition {
  /**
   * Requirement 2: tag each element with the index of the partition it
   * lives in, producing `(partitionIndex, element)` pairs.
   *
   * NOTE: an explicit `main` is used instead of `extends App` — the App
   * trait's delayed initialization turns top-level vals into fields, which
   * interacts badly with Spark closure serialization.
   */
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local").setAppName("mapPartitionsWithIndexTest")
    val sc: SparkContext = new SparkContext(sparkconf)
    try {
      // 7 elements over 2 partitions.
      val rdd: RDD[Int] = sc.makeRDD(List(1, -2, 3, 14, 1, -10, -100), 2)
      // Pair every element with its partition's index. The result type is
      // annotated explicitly: a public-ish val's type should not be inferred.
      val absRdd: RDD[(Int, Int)] = rdd.mapPartitionsWithIndex(
        (index, iter) => {
          iter.map(
            (index, _)
          )
        }
      )
      println(s"rdd1 :${absRdd.collect().mkString(",")}")
    } finally {
      // Always release the SparkContext, even if the job above fails.
      sc.stop()
    }
  }
}