Pair RDD Operations
1. mapValues / flatMapValues / keys / values: these can all be implemented with a plain map; they are provided as convenience operations.
scala> val rdd1 = sc.parallelize(List((1,2),(3,4),(5,6)))
rdd1: org.apache.spark.rdd.RDD[(Int, Int)] = ParallelCollectionRDD[0] at parallelize at <console>:24

scala> val rdd2 = rdd1.mapValues(x => 1 to x)
rdd2: org.apache.spark.rdd.RDD[(Int, scala.collection.immutable.Range.Inclusive)] = MapPartitionsRDD[1] at mapValues at <console>:25

scala> rdd2.collect
res0: Array[(Int, scala.collection.immutable.Range.Inclusive)] = Array((1,Range 1 to 2), (3,Range 1 to 4), (5,Range 1 to 6))

scala> val rdd2 = rdd1.map(x => (x._1, 1 to x._2))
rdd2: org.apache.spark.rdd.RDD[(Int, scala.collection.immutable.Range.Inclusive)] = MapPartitionsRDD[2] at map at <console>:25

scala> rdd2.collect
res1: Array[(Int, scala.collection.immutable.Range.Inclusive)] = Array((1,Range 1 to 2), (3,Range 1 to 4), (5,Range 1 to 6))

scala> val rdd2 = rdd1.map{case (k,v) => (k, 1 to v)}
rdd2: org.apache.spark.rdd.RDD[(Int, scala.collection.immutable.Range.Inclusive)] = MapPartitionsRDD[3] at map at <console>:25

scala> rdd2.collect
res2: Array[(Int, scala.collection.immutable.Range.Inclusive)] = Array((1,Range 1 to 2), (3,Range 1 to 4), (5,Range 1 to 6))

scala> val rdd3 = rdd1.flatMapValues(x => 1 to x)
rdd3: org.apache.spark.rdd.RDD[(Int, Int)] = MapPartitionsRDD[4] at flatMapValues at <console>:25

scala> rdd3.collect
res3: Array[(Int, Int)] = Array((1,1), (1,2), (3,1), (3,2), (3,3), (3,4), (5,1), (5,2), (5,3), (5,4), (5,5), (5,6))

scala> val rdd3 = rdd1.map(x => (x._1, 1 to x._2)).flatMap{case (k,v) => v.map(x => (k,x))}
rdd3: org.apache.spark.rdd.RDD[(Int, Int)] = MapPartitionsRDD[6] at flatMap at <console>:25

scala> rdd3.collect
res4: Array[(Int, Int)] = Array((1,1), (1,2), (3,1), (3,2), (3,3), (3,4), (5,1), (5,2), (5,3), (5,4), (5,5), (5,6))

scala> rdd3.keys.collect
res6: Array[Int] = Array(1, 1, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5)

scala> rdd3.values.collect
res7: Array[Int] = Array(1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6)
2. groupByKey / reduceByKey / foldByKey / aggregateByKey: aggregation operations
scala> val rdd1 = sc.makeRDD(Array(("spark", 12), ("hadoop", 26),("hadoop", 23), ("spark", 15), ("scala", 26), ("spark", 25),("spark", 23), ("hadoop", 16), ("scala", 24), ("spark", 16))) rdd1: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[10] at makeRDD at <console>:24 scala> val rdd2 = rdd1.groupByKey() rdd2: org.apache.spark.rdd.RDD[(String, Iterable[Int])] = ShuffledRDD[11] at groupByKey at <console>:25 scala> rdd2.collect res8: Array[(String, Iterable[Int])] = Array((scala,CompactBuffer(24, 26)), (spark,CompactBuffer(12, 15, 25, 23, 16)), (hadoop,CompactBuffer(26, 23, 16))) scala> val rdd2 = rdd1.groupByKey().map(x => (x._1, x._2.sum.toDouble / x._2.size)) rdd2: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[13] at map at <console>:25 scala> rdd2.collect res9: Array[(String, Double)] = Array((scala,25.0), (spark,18.2), (hadoop,21.666666666666668)) scala> val rdd2 = rdd1.mapValues((_,1)).reduceByKey((x,y) => (x._1+y._1, x._2+y._2)).mapValues(x => (x._1.toDouble / x._2)) rdd2: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[16] at mapValues at <console>:25 scala> rdd2.collect res10: Array[(String, Double)] = Array((scala,25.0), (spark,18.2), (hadoop,21.666666666666668)) scala> val rdd2 = rdd1.mapValues((_,1)).foldByKey((0,0))((x,y) => {(x._1+y._1, x._2+y._2)}).mapValues(x => x._1.toDouble / x._2) rdd2: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[19] at mapValues at <console>:25 scala> rdd2.collect res11: Array[(String, Double)] = Array((scala,25.0), (spark,18.2), (hadoop,21.666666666666668)) scala> val rdd2 = rdd1.mapValues((_,1)). | aggregateByKey((0,0))( | (x,y) => (x._1+y._1, x._2+y._2), | (a,b) => (a._1+b._1, a._2+b._2)).mapValues(x => x._1.toDouble / x._2) rdd2: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[22] at mapValues at <console>:28 scala> rdd2.collect res12: Array[(String, Double)] = Array((scala,25.0), (spark,18.2), (hadoop,21.666666666666668))
groupByKey has no map-side combiner, so a large amount of data is shuffled and it is less efficient.
reduceByKey has a map-side combiner, so less data is shuffled and it is more efficient. The word-count sketch below illustrates the difference.
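A minimal word-count sketch (hypothetical input data, run in the same spark-shell session): both versions produce the same counts, but reduceByKey shuffles only pre-combined partial sums.

val words = sc.parallelize(Seq("spark", "hadoop", "spark", "scala", "spark"))

// reduceByKey: values are combined inside each partition first (map-side combine),
// so at most one (word, partialCount) pair per key is shuffled from each partition
val counts1 = words.map(w => (w, 1)).reduceByKey(_ + _)

// groupByKey: every individual (word, 1) pair is shuffled, then summed on the reduce side
val counts2 = words.map(w => (w, 1)).groupByKey().mapValues(_.sum)

// Both yield (spark,3), (hadoop,1), (scala,1); the ordering may differ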
3. sortByKey: sort by key
scala> val rdd1 = sc.makeRDD(List("spark","hadoop","scala","hive","java")) rdd1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[0] at makeRDD at <console>:24 scala> val rdd2 = sc.makeRDD(1 to 5) rdd2: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[1] at makeRDD at <console>:24 scala> val rdd3 = rdd1.zip(rdd2) rdd3: org.apache.spark.rdd.RDD[(String, Int)] = ZippedPartitionsRDD2[2] at zip at <console>:27 scala> rdd3.collect res0: Array[(String, Int)] = Array((spark,1), (hadoop,2), (scala,3), (hive,4), (java,5)) scala> rdd3.sortByKey().collect res1: Array[(String, Int)] = Array((hadoop,2), (hive,4), (java,5), (scala,3), (spark,1)) scala> rdd3.sortByKey(false).collect res2: Array[(String, Int)] = Array((spark,1), (scala,3), (java,5), (hive,4), (hadoop,2))
4. cogroup / join / leftOuterJoin / rightOuterJoin / fullOuterJoin
scala> val rdd1 = sc.makeRDD(Array((1,"Spark"), (2,"Hadoop"),(3,"Kylin"), (4,"Flink"))) rdd1: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[9] at makeRDD at <console>:24 scala> val rdd2 = sc.makeRDD(Array((3,"Tom"), (4,"Tim"), (5,"Jack"),(6,"Marry"))) rdd2: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[10] at makeRDD at <console>:24 scala> val rdd3 = rdd1.cogroup(rdd2) rdd3: org.apache.spark.rdd.RDD[(Int, (Iterable[String], Iterable[String]))] = MapPartitionsRDD[12] at cogroup at <console>:27 scala> rdd3.collect.foreach(println) (4,(CompactBuffer(Flink),CompactBuffer(Tim))) (6,(CompactBuffer(),CompactBuffer(Marry))) (2,(CompactBuffer(Hadoop),CompactBuffer())) (1,(CompactBuffer(Spark),CompactBuffer())) (3,(CompactBuffer(Kylin),CompactBuffer(Tom))) (5,(CompactBuffer(),CompactBuffer(Jack))) scala> val rdd3 = rdd1.join(rdd2) rdd3: org.apache.spark.rdd.RDD[(Int, (String, String))] = MapPartitionsRDD[15] at join at <console>:27 scala> rdd3.collect.foreach(println) (4,(Flink,Tim)) (3,(Kylin,Tom)) scala> val rdd3 = rdd1.leftOuterJoin(rdd2) rdd3: org.apache.spark.rdd.RDD[(Int, (String, Option[String]))] = MapPartitionsRDD[18] at leftOuterJoin at <console>:27 scala> rdd3.collect.foreach(println) (4,(Flink,Some(Tim))) (2,(Hadoop,None)) (1,(Spark,None)) (3,(Kylin,Some(Tom))) scala> val rdd3 = rdd1.rightOuterJoin(rdd2) rdd3: org.apache.spark.rdd.RDD[(Int, (Option[String], String))] = MapPartitionsRDD[21] at rightOuterJoin at <console>:27 scala> rdd3.collect.foreach(println) (4,(Some(Flink),Tim)) (6,(None,Marry)) (3,(Some(Kylin),Tom)) (5,(None,Jack)) scala> val rdd3 = rdd1.fullOuterJoin(rdd2) rdd3: org.apache.spark.rdd.RDD[(Int, (Option[String], Option[String]))] = MapPartitionsRDD[24] at fullOuterJoin at <console>:27 scala> rdd3.collect.foreach(println) (4,(Some(Flink),Some(Tim))) (6,(None,Some(Marry))) (2,(Some(Hadoop),None)) (1,(Some(Spark),None)) (3,(Some(Kylin),Some(Tom))) (5,(None,Some(Jack)))
5. collectAsMap: similar to collect, but returns the pair RDD to the driver as a Map (only one value is kept per key).
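A minimal sketch (hypothetical data, assuming a running spark-shell with sc available):

// collectAsMap is an action: it returns the pair RDD to the driver as a Map;
// for duplicate keys only one value per key survives
val pairs = sc.makeRDD(Array(("spark", 1), ("hadoop", 2), ("spark", 3)))
val asMap: scala.collection.Map[String, Int] = pairs.collectAsMap()
// e.g. Map(hadoop -> 2, spark -> 3)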
6. countByKey: counts the number of elements for each key and returns the result to the driver as a Map.
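A minimal sketch (hypothetical data):

// countByKey is an action: it counts the elements per key and returns a Map to the driver
val pairs = sc.makeRDD(Array(("spark", 1), ("spark", 2), ("hadoop", 3)))
val counts: scala.collection.Map[String, Long] = pairs.countByKey()
// Map(spark -> 2, hadoop -> 1)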
7. lookup(key): an efficient lookup; if the RDD has a partitioner, only the partition that holds the key is searched.
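A minimal sketch (hypothetical data) of lookup on an RDD that carries a partitioner:

import org.apache.spark.HashPartitioner

// After partitionBy the RDD has a HashPartitioner, so lookup("spark")
// only scans the partition that "spark" hashes to
val pairs = sc.makeRDD(Array(("spark", 1), ("spark", 2), ("hadoop", 3)))
  .partitionBy(new HashPartitioner(4))
val values: Seq[Int] = pairs.lookup("spark")   // Seq(1, 2)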
