Spark RDD 练习
student.txt
12 张小军 25 男 chinese 50
12 张小军 25 男 math 60
12 张小军 25 男 english 70
12 李小凤 20 女 chinese 80
12 李小凤 20 女 math 80
12 李小凤 20 女 english 80
12 王大力 19 男 chinese 70
12 王大力 19 男 math 80
12 王大力 19 男 english 90
13 张大明 25 男 chinese 55
13 张大明 25 男 math 65
13 张大明 25 男 english 60
13 李小华 20 男 chinese 95
13 李小华 20 男 math 92
13 李小华 20 男 english 91
13 王小芳 19 女 chinese 75
13 王小芳 19 女 math 85
13 王小芳 19 女 english 90
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.log4j.{Level, Logger}

/**
 * Spark RDD exercise: answers a series of counting and averaging questions
 * over a whitespace-separated exam-score file.
 *
 * Each input line holds one record with six fields:
 * `classId name age sex course score`.
 *
 * The expected values quoted in the comments below are the results for the
 * 18-record sample file that accompanies this exercise.
 */
object App {

  /** One exam record: (classId, name, age, sex, course, score). */
  private type Record = (Int, String, Int, String, String, Double)

  /** Input file used when no path is passed on the command line. */
  private val DefaultInputPath = "F:\\files\\hdfs_file\\student.txt"

  /** Raises the log threshold of the "org.apache.spark" logger to ERROR. */
  def init(): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
  }

  /**
   * Parses one input line into a [[Record]].
   *
   * Fields are separated by any run of whitespace; fields after the sixth
   * are ignored.
   *
   * @param line a non-blank line of the input file
   * @return the record as (classId, name, age, sex, course, score)
   * @throws IllegalArgumentException if the line has fewer than six fields
   * @throws NumberFormatException    if classId, age or score is not numeric
   */
  private def parseRecord(line: String): Record =
    line.trim.split("\\s+") match {
      case Array(classId, name, age, sex, course, score, _*) =>
        (classId.toInt, name, age.toInt, sex, course, score.toDouble)
      case _ =>
        throw new IllegalArgumentException(
          s"Expected 6 whitespace-separated fields but got: '$line'")
    }

  /** Arithmetic mean of one key's scores (groupByKey never yields an empty group). */
  private def average(scores: Iterable[Double]): Double = scores.sum / scores.size

  /**
   * Runs every query of the exercise and prints each result to stdout.
   *
   * @param args optional; `args(0)` overrides the input file path
   */
  def main(args: Array[String]): Unit = {
    init()

    val inputPath = args.headOption.getOrElse(DefaultInputPath)
    val conf = new SparkConf().setMaster("local[*]").setAppName("spark-rdd-test1")
    val sc = new SparkContext(conf)

    // Stop the context even when a query fails, so it is not leaked.
    try {
      // RDD[(Int, String, Int, String, String, Double)]
      val records = sc.textFile(inputPath)
        .filter(_.trim.nonEmpty) // tolerate blank / trailing empty lines
        .map(parseRecord)

      // Every query below re-reads this RDD, so keep it cached.
      records.cache()

      println(records.collect().toList)
      // List(
      //   (12,张小军,25,男,chinese,50.0), (12,张小军,25,男,math,60.0), (12,张小军,25,男,english,70.0),
      //   (12,李小凤,20,女,chinese,80.0), (12,李小凤,20,女,math,80.0), (12,李小凤,20,女,english,80.0),
      //   (12,王大力,19,男,chinese,70.0), (12,王大力,19,男,math,80.0), (12,王大力,19,男,english,90.0),
      //   (13,张大明,25,男,chinese,55.0), (13,张大明,25,男,math,65.0), (13,张大明,25,男,english,60.0),
      //   (13,李小华,20,男,chinese,95.0), (13,李小华,20,男,math,92.0), (13,李小华,20,男,english,91.0),
      //   (13,王小芳,19,女,chinese,75.0), (13,王小芳,19,女,math,85.0), (13,王小芳,19,女,english,90.0)
      // )

      // How many students took the exams?
      val studentNames = records.map(_._2).distinct()
      println(studentNames.count()) // 6
      println(studentNames.collect().toList)
      // List(张小军, 李小凤, 王小芳, 王大力, 张大明, 李小华)
      // Same answer obtained by grouping on the name instead of distinct().
      println(records.map(r => (r._1, r._2)).groupBy(_._2).count()) // 6

      // How many male students took the exams?
      val maleRecords = records.filter(_._4 == "男")
      println(maleRecords.map(_._2).distinct().count()) // 4
      println(maleRecords.groupBy(_._2).count()) // 4

      // How many students aged exactly 20 took the exams?
      val aged20 = records.filter(_._3 == 20)
      println(aged20.map(_._2).distinct().count()) // 2
      println(aged20.map(_._2).distinct().collect().toList)
      // List(李小凤, 李小华)
      println(aged20.map(r => (r._2, r._3)).collect().toList)
      // List(
      //   (李小凤,20), (李小凤,20), (李小凤,20),
      //   (李小华,20), (李小华,20), (李小华,20)
      // )
      println(aged20.map(r => (r._2, r._3)).distinct().collect().toList)
      // List((李小凤,20), (李小华,20))

      // How many students older than 20 took the exams?
      val olderThan20 = records.filter(_._3 > 20)
      println(olderThan20.map(_._2).distinct().count()) // 2
      println(olderThan20.map(_._2).collect().toList)
      // List(张小军, 张小军, 张小军, 张大明, 张大明, 张大明)
      println(olderThan20.map(_._2).distinct().collect().toList)
      // List(张小军, 张大明)

      // How many students of class 12 took the exams?
      val class12Names = records.filter(_._1 == 12).map(_._2).distinct()
      println(class12Names.count()) // 3
      println(class12Names.collect().toList)
      // List(张小军, 李小凤, 王大力)

      // How many female students of class 13 took the exams?
      val class13FemaleNames = records
        .filter(r => r._1 == 13 && r._4 == "女")
        .map(_._2)
        .distinct()
      println(class13FemaleNames.count()) // 1
      println(class13FemaleNames.collect().toList)
      // List(王小芳)

      // Average score of the "chinese" course.
      val chineseMean = records.filter(_._5 == "chinese").map(_._6).mean()
      println(chineseMean) // 70.83333333333333

      // Average score per student: first gather each student's scores ...
      val scoresByStudent = records
        .map(r => (r._2, r._6))
        .groupByKey()
        .mapValues(_.toList)
      println(scoresByStudent.collect().toList)
      // List(
      //   (张小军,List(50.0, 60.0, 70.0)),
      //   (李小凤,List(80.0, 80.0, 80.0)),
      //   (王小芳,List(75.0, 85.0, 90.0)),
      //   (王大力,List(70.0, 80.0, 90.0)),
      //   (张大明,List(55.0, 65.0, 60.0)),
      //   (李小华,List(95.0, 92.0, 91.0))
      // )

      // ... then average them.
      val averageByStudent = scoresByStudent.mapValues(average)
      println(averageByStudent.collect().toList)
      // List(
      //   (张小军,60.0), (李小凤,80.0), (王小芳,83.33333333333333),
      //   (王大力,80.0), (张大明,60.0), (李小华,92.66666666666667)
      // )

      // Class 12: average score of each student.
      val class12Averages = records
        .filter(_._1 == 12)
        .map(r => (r._2, r._6))
        .groupByKey()
        .mapValues(average)
      println(class12Averages.collect().toList)
      // List((张小军,60.0), (李小凤,80.0), (王大力,80.0))

      // Class 12, male students only: average score of each student.
      val class12MaleAverages = records
        .filter(r => r._1 == 12 && r._4 == "男")
        .map(r => (r._2, r._6))
        .groupByKey()
        .mapValues(average)
      println(class12MaleAverages.collect().toList)
      // List((张小军,60.0), (王大力,80.0))

      // Class 13: average score of the "chinese" course.
      val class13ChineseAverage = records
        .filter(r => r._1 == 13 && r._5 == "chinese")
        .map(r => (r._5, r._6))
        .groupByKey()
        .mapValues(average)
      println(class13ChineseAverage.collect().toList)
      // List((chinese,75.0))

      // How many students have a total score above 150?
      // Collected once so the list and its size come from a single job.
      val highTotals = records
        .map(r => (r._2, r._6))
        .groupByKey()
        .mapValues(_.sum)
        .filter(_._2 > 150)
        .collect()
        .toList
      println(highTotals)
      // List(
      //   (张小军,180.0), (李小凤,240.0), (王小芳,250.0),
      //   (王大力,240.0), (张大明,180.0), (李小华,278.0)
      // )
      println(highTotals.size) // 6

      // How many male students of class 12 have a total score above 150?
      val class12MaleHighTotals = records
        .filter(r => r._1 == 12 && r._4 == "男")
        .map(r => (r._2, r._6))
        .groupByKey()
        .mapValues(_.sum)
        .filter(_._2 > 150)
        .collect()
        .toList
      println(class12MaleHighTotals)
      // List((张小军,180.0), (王大力,240.0))
      println(class12MaleHighTotals.size) // 2

      records.unpersist()
    } finally {
      sc.stop()
    }
  }
}

浙公网安备 33010602011771号