package com.atguigu.bigata.spark.core.rdd.builder.operator.transform
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Demonstrates the key-based aggregation operators combineByKey, aggregateByKey, foldByKey and reduceByKey.
 *
 * @author ${user}
 * @date 2022/2/14 23:36
 */
object spark017_RDD_Operator_transform_AggregateByKey {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("Operator")
val sc = new SparkContext(conf)
val rdd = sc.parallelize(List(("a", 2), ("a", 1), ("b", 3), ("b", 4), ("b", 5), ("b", 6)), 2)
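// Sample pair RDD split into 2 partitions. With parallelize's default slicing the first partition
// should hold ("a", 2), ("a", 1), ("b", 3) and the second ("b", 4), ("b", 5), ("b", 6),
// which matters for the per-partition rules demonstrated below.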
/*
combineByKey (used here to compute the average value per key) takes three parameters:
1. the first converts the value of each key into a new structure (applied to the first value seen for a key)
2. the second is the merge rule within a partition
3. the third is the merge rule between partitions
*/
rdd.combineByKey(
value => (value, 1),
(tuple: (Int, Int), value) => {
(tuple._1 + value, tuple._2 + 1)
},
(tuple1: (Int, Int), tuple2: (Int, Int)) => {
(tuple1._1 + tuple2._1, tuple1._2 + tuple2._2)
}
).mapValues {
case (cnt, num) => {
cnt / num
}
}.collect.foreach(println)
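// For the sample data this prints (a,1) and (b,4): a -> (2 + 1) / 2 = 1 and b -> (3 + 4 + 5 + 6) / 4 = 4
// (integer division; the key order in the output may vary).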
/*
1. When the rule within a partition differs from the rule between partitions, use aggregateByKey.
It is curried: the first parameter list is the zero value (the starting point of the comparison),
the second parameter list takes the within-partition rule and the between-partition rule, e.g.
rdd.aggregateByKey(0)(Math.max, _ + _).collect.foreach(println)
2. When the two rules are the same, foldByKey is enough:
val rddFoldByKey = rdd.foldByKey(0)(_ + _)
*/
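// Runnable version of example 1 from the comment above: take the max per key inside each partition,
// then add the partition maxima together. Assuming the 3/3 partition split noted above, this should
// print (a,2) and (b,9): a -> max(2, 1) = 2, b -> max(3) + max(4, 5, 6) = 3 + 6 = 9.
rdd.aggregateByKey(0)(Math.max, _ + _).collect.foreach(println)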
// Average value per key again, this time with aggregateByKey: the zero value (0, 0) is the initial (sum, count) accumulator.
rdd.aggregateByKey((0, 0))(
(t, v) => {
(t._1 + v, t._2 + 1)
},
(t1, t2) => {
(t1._1 + t2._1, t1._2 + t2._2)
}
).mapValues {
case (cnt, num) => cnt / num
}.collect.foreach(println)
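// Same average computation as the combineByKey version above; the only difference is that the zero
// value (0, 0) supplies the initial (sum, count) structure instead of a createCombiner function,
// so the result is again (a,1) and (b,4).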
println("=====================")
//WordCount01 reduceByKey
rdd.reduceByKey(_ + _).collect.foreach(println)
println("=======================")
//WordCount02 foldByKey
rdd.foldByKey(0)(_ + _).collect.foreach(println)
println("=======================")
//WordCount03 aggregateByKey
rdd.aggregateByKey(0)(_ + _, _ + _).collect.foreach(println)
println("=========================")
//WordCount04 combineByKey
rdd.combineByKey(v => v, (v1: Int, v2) => v1 + v2, (v1: Int, v2: Int) => v1 + v2).collect.foreach(println)
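// All four variants above compute the same per-key sums, (a,3) and (b,18); in current Spark versions
// reduceByKey, foldByKey and aggregateByKey are all implemented on top of combineByKeyWithClassTag,
// they just expose different amounts of the underlying machinery.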
sc.stop()
}
}