Learning Spark Hands-On: RDD Transformations
Prepare two input files in HDFS:
[zxm@m105 scala-2.10.5]$ hdfs dfs -text /user/zxm/data/data1_less.csv
0,王刚,29,北京市,1991-11-20
1,小米,29,北京市,1991-11-20
2,苹果,29,北京市,1991-11-20
3,三星,29,北京市,1991-11-20
4,公公,29,北京市,1991-11-20
5,中兴,29,北京市,1991-11-20
[zxm@m105 scala-2.10.5]$ hdfs dfs -text /user/zxm/data/data2_less.csv
0,王刚,29,北京市,1991-11-19
1,三星,29,北京市,1991-11-19
2,海尔,29,北京市,1991-11-19
3,金工,29,北京市,1991-11-19
4,工薪,29,北京市,1991-11-19
5,肖东方,29,北京市,1991-11-19
6,吴天杰,29,北京市,1991-11-19
1. Spark transformation: map
scala> val text1 = sc.textFile("/user/zxm/data/data1_less.csv")
text1: org.apache.spark.rdd.RDD[String] = /user/zxm/data/data1_less.csv MapPartitionsRDD[1] at textFile at <console>:21
scala> val text2 = sc.textFile("/user/zxm/data/data2_less.csv")
text2: org.apache.spark.rdd.RDD[String] = /user/zxm/data/data2_less.csv MapPartitionsRDD[3] at textFile at <console>:21
scala> val mapResult = text1.map(line => line.split(","))
mapResult: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[4] at map at <console>:23
scala> mapResult.collect
res1: Array[Array[String]] = Array(Array(0, 王刚, 29, 北京市, 1991-11-20), Array(1, 小米, 29, 北京市, 1991-11-20), Array(2, 苹果, 29, 北京市, 1991-11-20), Array(3, 三星, 29, 北京市, 1991-11-20), Array(4, 公公, 29, 北京市, 1991-11-20), Array(5, 中兴, 29, 北京市, 1991-11-20))
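The map above leaves every row as a raw Array[String]. As a minimal sketch (the variable name users is made up), parsing each line into a typed tuple usually makes later keyed operations more convenient:

// parse each CSV line into (id, name, age, city, date)
val users = text1.map { line =>
  val f = line.split(",")
  (f(0).toInt, f(1), f(2).toInt, f(3), f(4))
}
users.first  // (0,王刚,29,北京市,1991-11-20)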
2. Spark transformation: flatMap
scala> val flatMapResult = text1.flatMap(line => line.split(","))
flatMapResult: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[10] at flatMap at <console>:23
scala> flatMapResult.collect
res2: Array[String] = Array(0, 王刚, 29, 北京市, 1991-11-20, 1, 小米, 29, 北京市, 1991-11-20, 2, 苹果, 29, 北京市, 1991-11-20, 3, 三星, 29, 北京市, 1991-11-20, 4, 公公, 29, 北京市, 1991-11-20, 5, 中兴, 29, 北京市, 1991-11-20)
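Note the contrast with map: map emits exactly one output element per input line, while flatMap flattens the per-line arrays into a single stream of fields. A quick sanity check (the counts follow from the data shown above):

text1.count                        // 6 lines
text1.flatMap(_.split(",")).count  // 6 lines x 5 fields = 30 elements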
3. Spark transformation: filter
scala> val filterResult = text1.filter(line => (line.split(",")(0) == "0"))
filterResult: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[12] at filter at <console>:23
scala> filterResult.collect
res10: Array[String] = Array(0,王刚,29,北京市,1991-11-20)
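The predicate can inspect any column, not only the first one. A small sketch (hypothetical variable name) that filters on the name column instead:

// keep only the rows whose second column (the name) is "三星"
val samsungRows = text1.filter(line => line.split(",")(1) == "三星")
samsungRows.collect  // Array(3,三星,29,北京市,1991-11-20)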
4. Spark transformation: union
scala> val unionResult = text1 union text2
unionResult: org.apache.spark.rdd.RDD[String] = UnionRDD[17] at union at <console>:25
scala> unionResult.collect
res15: Array[String] = Array(0,王刚,29,北京市,1991-11-20, 1,小米,29,北京市,1991-11-20, 2,苹果,29,北京市,1991-11-20, 3,三星,29,北京市,1991-11-20, 4,公公,29,北京市,1991-11-20, 5,中兴,29,北京市,1991-11-20, 0,王刚,29,北京市,1991-11-19, 1,三星,29,北京市,1991-11-19, 2,海尔,29,北京市,1991-11-19, 3,金工,29,北京市,1991-11-19, 4,工薪,29,北京市,1991-11-19, 5,肖东方,29,北京市,1991-11-19, 6,吴天杰,29,北京市,1991-11-19)
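union simply concatenates the two RDDs without deduplicating, so the element counts add up:

unionResult.count  // 6 + 7 = 13; chain .distinct afterwards if duplicates must go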
5. Spark transformation: join
scala> val rdd1 = text1.map(line => ((line.split(",")(1), line)))
rdd1: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[31] at map at <console>:23
scala> val rdd2 = text2.map(line => ((line.split(",")(1), line)))
rdd2: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[32] at map at <console>:23
scala> rdd1.collect
res23: Array[(String, String)] = Array((王刚, 0,王刚,29,北京市,1991-11-20), (小米, 1,小米,29,北京市,1991-11-20), (苹果, 2,苹果,29,北京市,1991-11-20), (三星, 3,三星,29,北京市,1991-11-20), (公公, 4,公公,29,北京市,1991-11-20), (中兴, 5,中兴,29,北京市,1991-11-20))
scala> rdd2.collect
res24: Array[(String, String)] = Array((王刚, 0,王刚,29,北京市,1991-11-19), (三星, 1,三星,29,北京市,1991-11-19), (海尔, 2,海尔,29,北京市,1991-11-19), (金工, 3,金工,29,北京市,1991-11-19), (工薪, 4,工薪,29,北京市,1991-11-19), (肖东方, 5,肖东方,29,北京市,1991-11-19), (吴天杰, 6,吴天杰,29,北京市,1991-11-19))
scala> val joinResult = rdd1 join rdd2
joinResult: org.apache.spark.rdd.RDD[(String, (String, String))] = MapPartitionsRDD[42] at join at <console>:29
scala> joinResult.collect
res30: Array[(String, (String, String))] = Array((三星,(3,三星,29,北京市,1991-11-20,1,三星,29,北京市,1991-11-19)), (王刚,(0,王刚,29,北京市,1991-11-20,0,王刚,29,北京市,1991-11-19)))
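join is an inner join, so 小米, 苹果, 海尔 and the other unmatched names drop out. When every key from the left side must survive, leftOuterJoin wraps the right side in an Option (a sketch; the variable name is made up):

val leftResult = rdd1.leftOuterJoin(rdd2)
// unmatched left keys get None, e.g. (小米,(1,小米,29,北京市,1991-11-20,None))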
6. Spark transformation: distinct
scala> val rdd1 = sc.parallelize(List(('a',1),('a',1),('b',3),('b',4)))
rdd1: org.apache.spark.rdd.RDD[(Char, Int)] = ParallelCollectionRDD[17] at parallelize at <console>:21
scala> val distinctResult = rdd1.distinct
distinctResult: org.apache.spark.rdd.RDD[(Char, Int)] = MapPartitionsRDD[20] at distinct at <console>:23
scala> distinctResult.collect
res10: Array[(Char, Int)] = Array((b,4), (b,3), (a,1))
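distinct compares whole elements, which is why both ('b',3) and ('b',4) survive while the duplicated ('a',1) collapses into one. To deduplicate by key alone, one option (a sketch that keeps an arbitrary value per key) is:

rdd1.reduceByKey((a, b) => a).collect  // one pair per key, e.g. Array((b,3), (a,1))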
7. Spark transformation: intersection
scala> val rdd1 = sc.parallelize(List("abc","bcd","cde"))
rdd1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[39] at parallelize at <console>:21
scala> val rdd2 = sc.parallelize(List("cde","def","efg"))
rdd2: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[40] at parallelize at <console>:21
scala> val intersectionResult = rdd1 intersection rdd2
intersectionResult: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[46] at intersection at <console>:25
scala> intersectionResult.collect
res20: Array[String] = Array(cde)
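intersection shuffles both sides and also deduplicates its result. In effect (a rough sketch of the semantics, not of Spark's actual implementation) it behaves like a self-keyed join:

val viaJoin = rdd1.map(x => (x, 1)).join(rdd2.map(x => (x, 1))).keys.distinct
viaJoin.collect  // Array(cde), the same as intersection above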
8. Spark transformation: cartesian (Cartesian product)
scala> val cartesianResult = text1 cartesian text2
cartesianResult: org.apache.spark.rdd.RDD[(String, String)] = CartesianRDD[48] at cartesian at <console>:25
scala> cartesianResult.collect
res25: Array[(String, String)] = Array((0,王刚,29,北京市,1991-11-20,0,王刚,29,北京市,1991-11-19), (0,王刚,29,北京市,1991-11-20,1,三星,29,北京市,1991-11-19), (0,王刚,29,北京市,1991-11-20,2,海尔,29,北京市,1991-11-19), (0,王刚,29,北京市,1991-11-20,3,金工,29,北京市,1991-11-19), (1,小米,29,北京市,1991-11-20,0,王刚,29,北京市,1991-11-19), (1,小米,29,北京市,1991-11-20,1,三星,29,北京市,1991-11-19), (1,小米,29,北京市,1991-11-20,2,海尔,29,北京市,1991-11-19), (1,小米,29,北京市,1991-11-20,3,金工,29,北京市,1991-11-19), (2,苹果,29,北京市,1991-11-20,0,王刚,29,北京市,1991-11-19), (2,苹果,29,北京市,1991-11-20,1,三星,29,北京市,1991-11-19), (2,苹果,29,北京市,1991-11-20,2,海尔,29,北京市,1991-11-19), (2,苹果,29,北京市,1991-11-20,3,金工,29,北京市,1991-11-19), (3,三星,29,北京市,1991-11-20,0,王刚,29,北京市,1991-11-19), (3,三星,29,北京市,1991-11-20,1,三星,29,北京市,1991-11-19), (3,三星,29,北京市,1991-11-20,2,海尔,29,北京市,1991-11-19), (3,三星,29,北京市,1991-11-20...
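Use cartesian with care: the result size is the product of the two input sizes, which explodes quickly on real data sets:

cartesianResult.count  // 6 * 7 = 42 pairs even for these two tiny files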
9. Spark transformation: groupByKey
scala> val rdd1 = sc.parallelize(List(('a',1),('a',1),('b',3),('b',4)))
rdd1: org.apache.spark.rdd.RDD[(Char, Int)] = ParallelCollectionRDD[49] at parallelize at <console>:21
scala> val groupByKeyResult = rdd1.groupByKey()
scala> groupByKeyResult.collect
res26: Array[(Char, Iterable[Int])] = Array((b,CompactBuffer(3, 4)), (a,CompactBuffer(1, 1)))
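If the groups are only needed to compute an aggregate, reducing inside the shuffle is cheaper than materializing the CompactBuffers. A sketch of two routes to the same per-key sums (reduceByKey is covered in the next section):

groupByKeyResult.mapValues(_.sum).collect  // Array((b,7), (a,2))
rdd1.reduceByKey(_ + _).collect            // same result with map-side combining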
10. Spark transformation: reduceByKey
scala> val rdd1 = sc.parallelize(List(('a',1),('a',1),('b',3),('b',4)))
rdd1: org.apache.spark.rdd.RDD[(Char, Int)] = ParallelCollectionRDD[52] at parallelize at <console>:21
scala> val reduceByKeyResult = rdd1 reduceByKey((a, b) => (a + b))
reduceByKeyResult: org.apache.spark.rdd.RDD[(Char, Int)] = ShuffledRDD[53] at reduceByKey at <console>:23
scala> reduceByKeyResult.collect
res31: Array[(Char, Int)] = Array((b,7), (a,2))
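Applied to the CSV data from the beginning, the same pattern counts rows per city (column index 3); a sketch with a made-up variable name:

val cityCounts = text1.map(line => (line.split(",")(3), 1)).reduceByKey(_ + _)
cityCounts.collect  // Array((北京市,6)), since every row shares the same city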
11. Spark transformation: aggregate
scala> def seqOP(a:Int, b:Int) : Int = {
| println("seqOP: " + a + "\t" + b)
| math.max(a,b)
| }
seqOP: (a: Int, b: Int)Int
scala> def combOP(a:Int, b:Int): Int = {
| println("combOP: " + a + "\t" + b)
| math.max(a,b)
| }
combOP: (a: Int, b: Int)Int
scala> val rdd = sc.parallelize(List(1, 2, 3, 4, 5, 6), 2)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:21
scala> rdd.aggregate(0)(seqOP, combOP)
stdout@m105:
seqOP: 0 4
seqOP: 4 5
seqOP: 5 6
stdout@m103:
seqOP: 0 1
seqOP: 1 2
seqOP: 2 3
combOP: 0 6
combOP: 6 3
res0: Int = 6
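The trace shows the mechanics: seqOP folds the elements within each partition starting from the zero value 0, and combOP then merges the per-partition results, again starting from 0. Because the accumulator type may differ from the element type, aggregate also supports patterns such as a one-pass mean (a minimal sketch):

val (sum, cnt) = rdd.aggregate((0, 0))(
  (acc, v) => (acc._1 + v, acc._2 + 1),   // seqOp: fold one element into (sum, count)
  (a, b)   => (a._1 + b._1, a._2 + b._2)  // combOp: merge two partial (sum, count) pairs
)
val mean = sum.toDouble / cnt  // 21 / 6 = 3.5 for List(1, 2, 3, 4, 5, 6)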
12. Spark transformation: aggregateByKey
scala> var data=sc.parallelize(List((1,4),(1,5),(1,8),(1,5),(2,7),(2,9)))
data: org.apache.spark.rdd.RDD[(Int, Int)] = ParallelCollectionRDD[13] at parallelize at <console>:21
scala> data.aggregateByKey(0)(seqOP, combOP).collect
stdout@m105:
seqOP: 0 5 # key:1
seqOP: 0 7 # key:2
seqOP: 7 9 # key:2
stdout@m103:
seqOP: 0 4 # key:1
seqOP: 4 5 # key:1
seqOP: 5 8 # key:1
combOP: 8 5
res5: Array[(Int, Int)] = Array((2,9), (1,8))
One difference between aggregate and aggregateByKey: when combining, aggregate first compares the initial value 0 with one partition's result and then compares that maximum against the maximum from the other partition (hence combOP: 0 6 followed by combOP: 6 3 above); aggregateByKey does not fold the initial value in again during the combine phase (only combOP: 8 5 appears).
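For a plain per-key maximum over non-negative values, aggregateByKey with zero value 0 therefore ends up equivalent to a reduceByKey (sketch):

data.reduceByKey((a, b) => math.max(a, b)).collect  // Array((2,9), (1,8)), matching res5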
13. Spark transformation: sortByKey
scala> val rdd1=sc.parallelize(List((2,"abc"), (1,"cde"), (5, "efg"), (3, "hij")))
rdd1: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[4] at parallelize at <console>:21
scala> val sortByKeyResult = rdd1.sortByKey()
sortByKeyResult: org.apache.spark.rdd.RDD[(Int, String)] = ShuffledRDD[7] at sortByKey at <console>:23
scala> sortByKeyResult.collect()
res5: Array[(Int, String)] = Array((1,cde), (2,abc), (3,hij), (5,efg))
scala> val sortByKeyResult = rdd1.sortByKey(ascending=false)
sortByKeyResult: org.apache.spark.rdd.RDD[(Int, String)] = ShuffledRDD[10] at sortByKey at <console>:23
scala> sortByKeyResult.collect()
res6: Array[(Int, String)] = Array((5,efg), (3,hij), (2,abc), (1,cde))
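sortByKey always sorts by the pair's key. To sort by anything else there is the more general sortBy, which takes a key-extractor function (sketch):

rdd1.sortBy(_._2).collect  // sorted by the String value: Array((2,abc), (1,cde), (5,efg), (3,hij))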
14. Spark transformation: cogroup
scala> val rdd1=sc.parallelize(List((2,"abc"),(1,"cde"),(5, "efg"),(3, "hij")))
rdd1: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[15] at parallelize at <console>:21
scala> val rdd2=sc.parallelize(List((2,123), (1,234), (5, 345), (3, 567)))
rdd2: org.apache.spark.rdd.RDD[(Int, Int)] = ParallelCollectionRDD[16] at parallelize at <console>:21
scala> val cogroupResult = rdd1.cogroup(rdd2)
cogroupResult: org.apache.spark.rdd.RDD[(Int, (Iterable[String], Iterable[Int]))] = MapPartitionsRDD[18] at cogroup at <console>:25
scala> cogroupResult.collect()
res9: Array[(Int, (Iterable[String], Iterable[Int]))] = Array((2,(CompactBuffer(abc),CompactBuffer(123))), (1,(CompactBuffer(cde),CompactBuffer(234))), (3,(CompactBuffer(hij),CompactBuffer(567))), (5,(CompactBuffer(efg),CompactBuffer(345))))
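cogroup is the primitive underneath the join family: join can be recovered by flattening the Cartesian product of the two per-key buffers (a sketch; the variable name is made up):

val joinViaCogroup = rdd1.cogroup(rdd2).flatMapValues {
  case (vs, ws) => for (v <- vs; w <- ws) yield (v, w)
}
joinViaCogroup.collect  // the same pairs as rdd1.join(rdd2)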
