package com.xujunqi.spack

import org.apache.spark.{SparkConf, SparkContext}

// Exercise 3: intersection and union
object JiaojiAndBingji {
  def main(args: Array[String]): Unit = {
    // Create the SparkConf
    val sparkconf = new SparkConf().setAppName("JiaojiAndBingji").setMaster("local")

    val sc = new SparkContext(sparkconf)

    val rdd1 = sc.parallelize(List(4, 5, 6, 4))
    val rdd2 = sc.parallelize(List(4, 6, 7, 8))
    // Union of the two RDDs (keeps duplicates)
    val rdd3 = rdd1.union(rdd2)
    // Intersection of the two RDDs
    val rdd4 = rdd1.intersection(rdd2)
    // Deduplicate the union (distinct returns a new RDD, so keep a reference to it)
    val rdd5 = rdd3.distinct()
    rdd5.collect()
    rdd4.collect()

    rdd5.foreach(println(_))
    rdd4.foreach(println(_))

    sc.stop()
  }
}
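As a rough sanity check, the same operations on plain Scala collections (no Spark) show what the union, intersection and distinct calls above are expected to contain; note that RDD output order may differ and that RDD intersection also deduplicates its result.

// Plain-Scala analogue of the RDD operations above (illustration only)
val a = List(4, 5, 6, 4)
val b = List(4, 6, 7, 8)
val unioned      = a ++ b              // List(4, 5, 6, 4, 4, 6, 7, 8): union keeps duplicates
val intersected  = a.intersect(b)      // List(4, 6)
val deduplicated = (a ++ b).distinct   // List(4, 5, 6, 7, 8)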
package com.xujunqi.spack

import org.apache.spark.{SparkConf, SparkContext}

object TestCogroup {
  def main(args: Array[String]): Unit = {
    // Create the SparkConf object
    val sparkConf = new SparkConf().setAppName("TestCogroup").setMaster("local")
    // Create the SparkContext
    val sc = new SparkContext(sparkConf)

    val rdd01 = sc.parallelize(List(("tom", 1), ("tom", 2), ("jerry", 3), ("kitty", 2)))
    val rdd02 = sc.parallelize(List(("jerry", 2), ("tom", 1), ("jim", 2)))
    // cogroup: for every key, pair up the values from both RDDs as two Iterables
    val rdd03 = rdd01.cogroup(rdd02)
    rdd03.foreach(println(_))
    // Note the difference between cogroup and groupByKey (see the sketch after this file)
    rdd03.collect()

    sc.stop()
  }
}
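To make the cogroup vs. groupByKey note concrete, here is a minimal sketch. It assumes the rdd01 and rdd02 defined in the main method above (it would sit just before sc.stop()), and the CompactBuffer renderings of the printed Iterables are illustrative:

// groupByKey works on a single pair RDD and merges all values for a key into one Iterable,
// printing something like ("tom", CompactBuffer(1, 2))
rdd01.groupByKey().foreach(println(_))

// cogroup works on two pair RDDs and keeps each side's values in its own Iterable,
// printing something like ("tom", (CompactBuffer(1, 2), CompactBuffer(1))).
// Keys present in only one RDD still appear, with an empty Iterable on the other side,
// e.g. ("jim", (CompactBuffer(), CompactBuffer(2)))
rdd01.cogroup(rdd02).foreach(println(_))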
package com.xujunqi.spack
// Exercise 2: flatMap

import org.apache.spark.{SparkConf, SparkContext}

object TestFlatMap {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("TestFlatMap").setMaster("local")

    val sc = new SparkContext(sparkConf)

    val add1 = sc.parallelize(Array("a,b,c", "d,e,f", "a,s,d"))
    // Split every element of add1 on the comma, then flatten the pieces into one RDD
    val add2 = add1.flatMap(_.split(","))
    add2.collect()

    add2.foreach(println(_))

    sc.stop()
  }
}
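The same split-and-flatten step on a plain Scala Array (no Spark) shows what add2 should contain once each string is split on the comma:

val flattened = Array("a,b,c", "d,e,f", "a,s,d").flatMap(_.split(","))
// flattened: Array(a, b, c, d, e, f, a, s, d), i.e. nine single-letter strings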
package com.xujunqi.spack

import org.apache.spark.{SparkConf, SparkContext}

// Exercise 4: join and groupByKey
object TestJoinAngGroupByKey {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("TestJoinAngGroupByKey").setMaster("local")

    val sc = new SparkContext(sparkConf)

    val rdd01 = sc.parallelize(List(("tom", 1), ("jerry", 2), ("kitty", 3)))
    val rdd02 = sc.parallelize(List(("jerry", 1), ("tom", 2), ("shuke", 3)))
    // join: keep only the keys present in both RDDs and pair up their values
    val rdd3 = rdd01.join(rdd02)
    rdd3.foreach(println(_))
    // Union of the two RDDs
    val rdd04 = rdd01.union(rdd02)
    rdd04.collect()
    rdd04.foreach(println(_))
    // Group the union by key
    val rdd05 = rdd04.groupByKey()
    rdd05.collect()
    rdd05.foreach(println(_))

    sc.stop()
  }
}
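For reference, the two results printed above have different shapes; a rough sketch of the expected output (element order varies between runs, and the CompactBuffer renderings are illustrative):

// rdd3 = rdd01.join(rdd02): only keys present in both RDDs, values paired in a tuple
//   ("tom", (1, 2)), ("jerry", (2, 1))
// rdd05 = rdd04.groupByKey(): every key from the union, values merged into one Iterable
//   ("tom", CompactBuffer(1, 2)), ("jerry", CompactBuffer(2, 1)),
//   ("kitty", CompactBuffer(3)), ("shuke", CompactBuffer(3))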
package com.xujunqi.spack

import org.apache.spark.{SparkConf, SparkContext}

// Exercise 1: map and filter
object TestMapAndFilter {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("TestMapAndFilter").setMaster("local")

    val sc = new SparkContext(sparkConf)
    // Build an RDD by parallelizing a local collection
    val rdd1 = sc.parallelize(List(8, 2, 6, 5, 7, 4, 1, 3, 5, 9))
    // Multiply every element of rdd1 by 2, then sort ascending
    val rdd2 = rdd1.map(_ * 2).sortBy(x => x, true)
    // Keep only the elements greater than or equal to 5
    val rdd3 = rdd2.filter(_ >= 5)
    // Bring the result back to the driver as an array
    rdd3.collect()
    rdd3.foreach(println(_))

    sc.stop()
  }
}
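A plain-Scala version of the same pipeline (illustration only) gives the values the exercise should end up with:

val result = List(8, 2, 6, 5, 7, 4, 1, 3, 5, 9).map(_ * 2).sorted.filter(_ >= 5)
// result: List(6, 8, 10, 10, 12, 14, 16, 18)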
package com.xujunqi.spack

import org.apache.spark.{SparkConf, SparkContext}

// Exercise 6: reduce
object TestReduce {
  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf().setAppName("TestReduce").setMaster("local")
    val sc = new SparkContext(sparkConf)

    val rdd1 = sc.parallelize(List(1, 2, 3, 4, 5))
    // reduce is an action: it aggregates the elements and returns a plain Int, not an RDD
    val rdd2 = rdd1.reduce(_ + _)

    rdd1.foreach(println(_))
    // rdd2 is already a local value on the driver, so there is nothing to collect; just print it
    println(rdd2)

    sc.stop()
  }
}
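Since reduce returns an ordinary value, the plain-Scala equivalent makes the result obvious:

val sum = List(1, 2, 3, 4, 5).reduce(_ + _)
// sum: Int = 15, the same value rdd1.reduce(_ + _) produces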
package com.xujunqi.spack

// Exercise 7: reduceByKey and sortByKey

import org.apache.spark.{SparkConf, SparkContext}

object TestReduceByKeyAndSortByKey {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("TestReduceByKeyAndSortByKey").setMaster("local")
    val sc = new SparkContext(sparkConf)
    val rdd1 = sc.parallelize(List(("tom", 1), ("jerry", 3), ("kitty", 2), ("shuke", 1)))
    val rdd2 = sc.parallelize(List(("jerry", 2), ("tom", 3), ("shuke", 2), ("kitty", 5)))

    val rdd3 = rdd1.union(rdd2)
    // Sum the values for each key
    val rdd4 = rdd3.reduceByKey(_ + _)
    rdd4.collect()

    // Sort by value, descending: swap key and value, sortByKey, then swap back
    val rdd5 = rdd4.map(t => (t._2, t._1)).sortByKey(false).map(t => (t._2, t._1))
    rdd5.foreach(println(_))
    rdd5.collect()

    sc.stop()
  }
}
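The swap / sortByKey / swap-back pattern above works; an alternative sketch, assuming the same rdd4, is to pass the sort key as a function to sortBy and order by the value directly:

// Sort the (word, total) pairs by their value in descending order
val sortedByValue = rdd4.sortBy(_._2, ascending = false)
sortedByValue.foreach(println(_))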
package com.xujunqi.spack

import org.apache.spark.{SparkConf, SparkContext}

// Exercise 8: repartition and coalesce
object TestRepartitionAndCoalesce {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("TestRepartitionAndCoalesce").setMaster("local")
    val sc = new SparkContext(sparkConf)

    // 10 elements spread over 3 partitions
    val rdd1 = sc.parallelize(1 to 10, 3)
    // Change the partition count of rdd1 with repartition
    // Decrease the number of partitions
    println(rdd1.repartition(2).partitions.size)
    // Increase the number of partitions
    println(rdd1.repartition(4).partitions.size)
    // Change the partition count of rdd1 with coalesce
    // Decrease the number of partitions
    println(rdd1.coalesce(2).partitions.size)
    // Note: repartition can both increase and decrease the number of partitions;
    // coalesce can only decrease it, and asking it for more partitions has no effect.
    sc.stop()
  }
}
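One nuance behind the final note: coalesce refuses to increase the partition count only with its default shuffle = false; passing shuffle = true forces a shuffle and does allow more partitions (repartition is in fact coalesce with shuffle = true). A small sketch, assuming the rdd1 above:

println(rdd1.coalesce(4).partitions.size)                 // still 3: without a shuffle the count cannot grow
println(rdd1.coalesce(4, shuffle = true).partitions.size) // 4: the shuffle allows the increase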