package com.shujia.spark.core
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo6GroupByKey {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
.setAppName("map")
.setMaster("local")
// Spark context object
val sc = new SparkContext(conf)
val linesRDD: RDD[String] = sc.textFile("data/words.txt")
val wordsRDD: RDD[String] = linesRDD.flatMap(_.split(","))
// Convert the word RDD into key-value (word, 1) pairs
val kvRDD: RDD[(String, Int)] = wordsRDD.map(word => (word, 1))
/**
* groupByKey: groups records by key and collects all the values for each key into an iterator
* groupBy: groups records by a user-specified grouping function; each value keeps the full record
*
* Both operators trigger a shuffle
*/
val groupByKeyRDD: RDD[(String, Iterable[Int])] = kvRDD.groupByKey()
val countRDD: RDD[(String, Int)] = groupByKeyRDD.map {
case (word: String, values: Iterable[Int]) =>
(word, values.sum)
}
countRDD.foreach(println)
val groupByRDD: RDD[(String, Iterable[(String, Int)])] = kvRDD.groupBy(kv => kv._1)
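// A minimal sketch (with illustrative names) of how the groupBy result could be turned into counts.
// groupBy keeps the whole (word, 1) tuple as the value, so we sum the second element of each tuple,
// whereas groupByKey above already stripped the values down to Ints.
val groupByCountRDD: RDD[(String, Int)] = groupByRDD.map {
case (word: String, kvs: Iterable[(String, Int)]) =>
(word, kvs.map(_._2).sum)
}
groupByCountRDD.foreach(println)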
}
}
package com.shujia.spark.core
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
object Demo7ReduceByKey {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
.setAppName("map")
.setMaster("local")
// Spark context object
val sc = new SparkContext(conf)
val linesRDD: RDD[String] = sc.textFile("data/words.txt")
val wordsRDD: RDD[String] = linesRDD.flatMap(_.split(","))
// Convert the word RDD into key-value (word, 1) pairs
val kvRDD: RDD[(String, Int)] = wordsRDD.map(word => (word, 1))
/**
* reduceByKey: aggregates all the values of each key with the given function
*/
val countRDD: RDD[(String, Int)] = kvRDD.reduceByKey((i: Int, j: Int) => i + j)
countRDD.foreach(println)
// Shorthand: when each parameter is used exactly once, it can be replaced with an underscore
val count2: RDD[(String, Int)] = kvRDD.reduceByKey(_ + _)
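// For illustration: the shorthand version produces the same word counts.
count2.foreach(println)
// Unlike groupByKey, reduceByKey pre-aggregates values on the map side before the shuffle,
// so less data is moved across the network for the same result.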
}
}