Pair RDD Operations

1. mapValues / flatMapValues / keys / values: all of these can be implemented with map; they are convenience shorthands.

scala> val rdd1 = sc.parallelize(List((1,2),(3,4),(5,6)))
rdd1: org.apache.spark.rdd.RDD[(Int, Int)] = ParallelCollectionRDD[0] at parallelize at <console>:24

scala> val rdd2 = rdd1.mapValues(x => 1 to x)
rdd2: org.apache.spark.rdd.RDD[(Int, scala.collection.immutable.Range.Inclusive)] = MapPartitionsRDD[1] at mapValues at <console>:25

scala> rdd2.collect
res0: Array[(Int, scala.collection.immutable.Range.Inclusive)] = Array((1,Range 1 to 2), (3,Range 1 to 4), (5,Range 1 to 6))

scala> val rdd2 = rdd1.map(x => (x._1, 1 to x._2))
rdd2: org.apache.spark.rdd.RDD[(Int, scala.collection.immutable.Range.Inclusive)] = MapPartitionsRDD[2] at map at <console>:25

scala> rdd2.collect
res1: Array[(Int, scala.collection.immutable.Range.Inclusive)] = Array((1,Range 1 to 2), (3,Range 1 to 4), (5,Range 1 to 6))

scala> val rdd2 = rdd1.map{case (k,v) => (k, 1 to v)}
rdd2: org.apache.spark.rdd.RDD[(Int, scala.collection.immutable.Range.Inclusive)] = MapPartitionsRDD[3] at map at <console>:25

scala> rdd2.collect
res2: Array[(Int, scala.collection.immutable.Range.Inclusive)] = Array((1,Range 1 to 2), (3,Range 1 to 4), (5,Range 1 to 6))

scala> val rdd3 = rdd1.flatMapValues(x => 1 to x)
rdd3: org.apache.spark.rdd.RDD[(Int, Int)] = MapPartitionsRDD[4] at flatMapValues at <console>:25

scala> rdd3.collect
res3: Array[(Int, Int)] = Array((1,1), (1,2), (3,1), (3,2), (3,3), (3,4), (5,1), (5,2), (5,3), (5,4), (5,5), (5,6))

scala> val rdd3 = rdd1.map(x => (x._1, 1 to x._2)).flatMap{case (k,v) => v.map(x => (k,x))}
rdd3: org.apache.spark.rdd.RDD[(Int, Int)] = MapPartitionsRDD[6] at flatMap at <console>:25

scala> rdd3.collect
res4: Array[(Int, Int)] = Array((1,1), (1,2), (3,1), (3,2), (3,3), (3,4), (5,1), (5,2), (5,3), (5,4), (5,5), (5,6))

scala> rdd3.keys.collect
res6: Array[Int] = Array(1, 1, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5)

scala> rdd3.values.collect
res7: Array[Int] = Array(1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6)

2. groupByKey / reduceByKey / foldByKey / aggregateByKey: aggregation operations

scala> val rdd1 = sc.makeRDD(Array(("spark", 12), ("hadoop", 26),("hadoop", 23), ("spark", 15), ("scala", 26), ("spark", 25),("spark", 23), ("hadoop", 16), ("scala", 24), ("spark", 16)))
rdd1: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[10] at makeRDD at <console>:24

scala> val rdd2 = rdd1.groupByKey()
rdd2: org.apache.spark.rdd.RDD[(String, Iterable[Int])] = ShuffledRDD[11] at groupByKey at <console>:25

scala> rdd2.collect
res8: Array[(String, Iterable[Int])] = Array((scala,CompactBuffer(24, 26)), (spark,CompactBuffer(12, 15, 25, 23, 16)), (hadoop,CompactBuffer(26, 23, 16)))

scala> val rdd2 = rdd1.groupByKey().map(x => (x._1, x._2.sum.toDouble / x._2.size))
rdd2: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[13] at map at <console>:25

scala> rdd2.collect
res9: Array[(String, Double)] = Array((scala,25.0), (spark,18.2), (hadoop,21.666666666666668))

scala> val rdd2 = rdd1.mapValues((_,1)).reduceByKey((x,y) => (x._1+y._1, x._2+y._2)).mapValues(x => (x._1.toDouble / x._2))
rdd2: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[16] at mapValues at <console>:25

scala> rdd2.collect
res10: Array[(String, Double)] = Array((scala,25.0), (spark,18.2), (hadoop,21.666666666666668))

scala> val rdd2 = rdd1.mapValues((_,1)).foldByKey((0,0))((x,y) => {(x._1+y._1, x._2+y._2)}).mapValues(x => x._1.toDouble / x._2)
rdd2: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[19] at mapValues at <console>:25

scala> rdd2.collect
res11: Array[(String, Double)] = Array((scala,25.0), (spark,18.2), (hadoop,21.666666666666668))

scala> val rdd2 = rdd1.mapValues((_,1)).
     |     aggregateByKey((0,0))(
     |     (x,y) => (x._1+y._1, x._2+y._2),
     |     (a,b) => (a._1+b._1, a._2+b._2)).mapValues(x => x._1.toDouble / x._2)
rdd2: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[22] at mapValues at <console>:28

scala> rdd2.collect
res12: Array[(String, Double)] = Array((scala,25.0), (spark,18.2), (hadoop,21.666666666666668))

groupByKey has no map-side combiner, so a large amount of data goes through the shuffle and it is less efficient.

reduceByKey has a map-side combiner, so less data goes through the shuffle and it is more efficient.

3. sortByKey: sorting by key

scala> val rdd1 = sc.makeRDD(List("spark","hadoop","scala","hive","java"))
rdd1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[0] at makeRDD at <console>:24

scala> val rdd2 = sc.makeRDD(1 to 5)
rdd2: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[1] at makeRDD at <console>:24

scala> val rdd3 = rdd1.zip(rdd2)
rdd3: org.apache.spark.rdd.RDD[(String, Int)] = ZippedPartitionsRDD2[2] at zip at <console>:27

scala> rdd3.collect
res0: Array[(String, Int)] = Array((spark,1), (hadoop,2), (scala,3), (hive,4), (java,5))

scala> rdd3.sortByKey().collect
res1: Array[(String, Int)] = Array((hadoop,2), (hive,4), (java,5), (scala,3), (spark,1))

scala> rdd3.sortByKey(false).collect
res2: Array[(String, Int)] = Array((spark,1), (scala,3), (java,5), (hive,4), (hadoop,2))

4. cogroup / join / leftOuterJoin / rightOuterJoin / fullOuterJoin: join operations

scala> val rdd1 = sc.makeRDD(Array((1,"Spark"), (2,"Hadoop"),(3,"Kylin"), (4,"Flink")))
rdd1: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[9] at makeRDD at <console>:24

scala> val rdd2 = sc.makeRDD(Array((3,"Tom"), (4,"Tim"), (5,"Jack"),(6,"Marry")))
rdd2: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[10] at makeRDD at <console>:24

scala> val rdd3 = rdd1.cogroup(rdd2)
rdd3: org.apache.spark.rdd.RDD[(Int, (Iterable[String], Iterable[String]))] = MapPartitionsRDD[12] at cogroup at <console>:27

scala> rdd3.collect.foreach(println)
(4,(CompactBuffer(Flink),CompactBuffer(Tim)))
(6,(CompactBuffer(),CompactBuffer(Marry)))
(2,(CompactBuffer(Hadoop),CompactBuffer()))
(1,(CompactBuffer(Spark),CompactBuffer()))
(3,(CompactBuffer(Kylin),CompactBuffer(Tom)))
(5,(CompactBuffer(),CompactBuffer(Jack)))

scala> val rdd3 = rdd1.join(rdd2)
rdd3: org.apache.spark.rdd.RDD[(Int, (String, String))] = MapPartitionsRDD[15] at join at <console>:27

scala> rdd3.collect.foreach(println)
(4,(Flink,Tim))
(3,(Kylin,Tom))

scala> val rdd3 = rdd1.leftOuterJoin(rdd2)
rdd3: org.apache.spark.rdd.RDD[(Int, (String, Option[String]))] = MapPartitionsRDD[18] at leftOuterJoin at <console>:27

scala> rdd3.collect.foreach(println)
(4,(Flink,Some(Tim)))
(2,(Hadoop,None))
(1,(Spark,None))
(3,(Kylin,Some(Tom)))

scala> val rdd3 = rdd1.rightOuterJoin(rdd2)
rdd3: org.apache.spark.rdd.RDD[(Int, (Option[String], String))] = MapPartitionsRDD[21] at rightOuterJoin at <console>:27

scala> rdd3.collect.foreach(println)
(4,(Some(Flink),Tim))
(6,(None,Marry))
(3,(Some(Kylin),Tom))
(5,(None,Jack))

scala> val rdd3 = rdd1.fullOuterJoin(rdd2)
rdd3: org.apache.spark.rdd.RDD[(Int, (Option[String], Option[String]))] = MapPartitionsRDD[24] at fullOuterJoin at <console>:27

scala> rdd3.collect.foreach(println)
(4,(Some(Flink),Some(Tim)))
(6,(None,Some(Marry)))
(2,(Some(Hadoop),None))
(1,(Some(Spark),None))
(3,(Some(Kylin),Some(Tom)))
(5,(None,Some(Jack)))

5. collectAsMap: like collect, but returns the pair RDD to the driver as a Map; when a key appears more than once, only one value per key is kept.
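
A minimal sketch in spark-shell (the sample data below is made up for illustration): collectAsMap pulls the pairs back to the driver as a Map, so duplicate keys keep only a single value.

scala> val pairs = sc.makeRDD(Array(("spark", 12), ("hadoop", 26), ("spark", 15)))

scala> pairs.collectAsMap()
// roughly Map(hadoop -> 26, spark -> 15): a scala.collection.Map on the driver, one value per key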

6. countByKey: counts the number of elements for each key and returns the result to the driver as a Map (it is an action, not a transformation).
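
A minimal sketch, again with made-up data: countByKey returns the per-key counts to the driver.

scala> val pairs = sc.makeRDD(Array(("spark", 12), ("hadoop", 26), ("spark", 15)))

scala> pairs.countByKey()
// roughly Map(spark -> 2, hadoop -> 1), a Map[String, Long] computed on the driver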

7. lookup(key): an efficient lookup; if the RDD has a partitioner, only the partition that the key maps to is searched.
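
A minimal sketch showing the effect of a partitioner (the HashPartitioner and sample data below are illustrative assumptions, not taken from the examples above):

scala> val pairs = sc.makeRDD(Array(("spark", 12), ("hadoop", 26), ("spark", 15)))

scala> val partitioned = pairs.partitionBy(new org.apache.spark.HashPartitioner(4))

scala> partitioned.lookup("spark")
// roughly Seq(12, 15); because partitioned has a HashPartitioner, only the single
// partition that "spark" hashes to is scanned instead of the whole RDD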
