Spark RDD 练习

student.txt

12 张小军 25 男 chinese 50
12 张小军 25 男 math 60
12 张小军 25 男 english 70
12 李小凤 20 女 chinese 80
12 李小凤 20 女 math 80
12 李小凤 20 女 english 80
12 王大力 19 男 chinese 70
12 王大力 19 男 math 80
12 王大力 19 男 english 90
13 张大明 25 男 chinese 55
13 张大明 25 男 math 65
13 张大明 25 男 english 60
13 李小华 20 男 chinese 95
13 李小华 20 男 math 92
13 李小华 20 男 english 91
13 王小芳 19 女 chinese 75
13 王小芳 19 女 math 85
13 王小芳 19 女 english 90

 

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.log4j.{Level, Logger}

/**
 * Spark RDD exercise over a whitespace-separated student score file.
 *
 * Each input line is expected to hold one record with six fields:
 * `classId name age sex course score`, e.g. `12 张小军 25 男 chinese 50`.
 *
 * Usage: `App [inputPath]` -- when no path is given, [[App.DefaultInputPath]] is used.
 */
object App {

  /** Input file read when no path is supplied on the command line. */
  private val DefaultInputPath = "F:\\files\\hdfs_file\\student.txt"

  /** One parsed input row: (classId, name, age, sex, course, score). */
  private type ScoreRecord = (Int, String, Int, String, String, Double)

  /** Lowers Spark's log level to ERROR so the exercise output is not drowned in INFO/WARN logs. */
  def init(): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
  }

  /**
   * Parses one input line into a [[ScoreRecord]].
   *
   * Fields are separated by runs of whitespace; fields beyond the sixth are ignored.
   *
   * @param line a non-blank line of the input file
   * @return (classId, name, age, sex, course, score)
   * @throws IllegalArgumentException if the line has fewer than six fields
   * @throws NumberFormatException    if classId, age or score is not numeric
   */
  private def parseLine(line: String): ScoreRecord = {
    val fields = line.trim.split("\\s+")
    require(fields.length >= 6, s"Expected at least 6 fields but found ${fields.length} in line: '$line'")
    (fields(0).toInt, fields(1), fields(2).toInt, fields(3), fields(4), fields(5).toDouble)
  }

  /**
   * Entry point: creates a local SparkContext, runs all exercises and always stops the context.
   *
   * @param args optional; `args(0)` overrides the input file path
   */
  def main(args: Array[String]): Unit = {
    init()

    val inputPath = args.headOption.getOrElse(DefaultInputPath)
    val conf = new SparkConf().setMaster("local[*]").setAppName("spark-rdd-test1")
    val sc = new SparkContext(conf)

    try {
      runExercises(sc, inputPath)
    } finally {
      // Always release the context (and anything it cached), even when a job fails.
      sc.stop()
    }
  }

  /**
   * Loads the score file and prints the answer to every exercise question.
   *
   * Students are identified by name only, so the counts below assume names are unique.
   *
   * @param sc        the active SparkContext
   * @param inputPath path of the score file to read
   */
  private def runExercises(sc: SparkContext, inputPath: String): Unit = {
    // Blank lines are skipped so that stray empty lines in the file do not abort the job.
    val tuples = sc.textFile(inputPath)
      .filter(_.trim.nonEmpty)
      .map(parseLine)
    // RDD[(Int, String, Int, String, String, Double)]

    // Every question below re-reads this RDD, so keep it in memory.
    tuples.cache()

    println(tuples.collect().toList)
    // List(
    //  (12,张小军,25,男,chinese,50.0), (12,张小军,25,男,math,60.0), (12,张小军,25,男,english,70.0),
    //  (12,李小凤,20,女,chinese,80.0), (12,李小凤,20,女,math,80.0), (12,李小凤,20,女,english,80.0),
    //  (12,王大力,19,男,chinese,70.0), (12,王大力,19,男,math,80.0), (12,王大力,19,男,english,90.0),
    //  (13,张大明,25,男,chinese,55.0), (13,张大明,25,男,math,65.0), (13,张大明,25,男,english,60.0),
    //  (13,李小华,20,男,chinese,95.0), (13,李小华,20,男,math,92.0), (13,李小华,20,男,english,91.0),
    //  (13,王小芳,19,女,chinese,75.0), (13,王小芳,19,女,math,85.0), (13,王小芳,19,女,english,90.0)
    // )

    // Tuple layout: (_1 classId, _2 name, _3 age, _4 sex, _5 course, _6 score)

    // How many students took the exam?
    val studentCount = tuples.map(_._2).distinct().count()
    println(studentCount) // 6
    println(tuples.map(_._2).distinct()
      .collect().toList)
    // List(张小军, 李小凤, 王小芳, 王大力, 张大明, 李小华)
    // Same count obtained by grouping on the name instead of using distinct().
    println(tuples.map(x => (x._1, x._2)).groupBy(_._2).count()) // 6

    // How many male students took the exam?
    val maleCount = tuples.filter(_._4 == "男").map(_._2).distinct().count()
    println(maleCount) // 4

    println(tuples.filter(_._4 == "男").groupBy(_._2).count()) // 4

    // How many students aged exactly 20 took the exam?
    val aged20 = tuples.filter(_._3 == 20)
    val aged20Count = aged20.map(_._2).distinct().count()
    println(aged20Count) // 2
    println(aged20.map(_._2).distinct()
      .collect().toList)
    // List(李小凤, 李小华)

    // Without distinct() every student appears once per course.
    println(aged20.map(x => (x._2, x._3))
      .collect().toList)
    // List(
    //  (李小凤,20), (李小凤,20), (李小凤,20),
    //  (李小华,20), (李小华,20), (李小华,20)
    // )
    println(aged20.map(x => (x._2, x._3)).distinct()
      .collect().toList)
    // List((李小凤,20), (李小华,20))

    // How many students older than 20 took the exam?
    val over20 = tuples.filter(_._3 > 20)
    val over20Count = over20.map(_._2).distinct().count()
    println(over20Count) // 2

    println(over20.map(_._2)
      .collect().toList)
    // List(张小军, 张小军, 张小军, 张大明, 张大明, 张大明)
    println(over20.map(_._2).distinct()
      .collect().toList)
    // List(张小军, 张大明)

    // How many students of class 12 took the exam?
    val class12Names = tuples.filter(_._1 == 12).map(_._2).distinct()
    println(class12Names.count()) // 3
    println(class12Names
      .collect().toList)
    // List(张小军, 李小凤, 王大力)

    // How many female students of class 13 took the exam?
    val class13FemaleNames = tuples.filter(x => x._1 == 13 && "女" == x._4).map(_._2).distinct()
    println(class13FemaleNames.count()) // 1
    println(class13FemaleNames
      .collect().toList)
    // List(王小芳)

    // Average score of the "chinese" course over all students.
    val chineseMean = tuples.filter(x => "chinese" == x._5).map(_._6).mean()
    println(chineseMean) // 70.83333333333333

    // Average score per student -- step 1: collect every student's scores.
    val scoresByStudent = tuples
      .map(x => (x._2, x._6))
      .groupByKey()
      .mapValues(_.toList)
    println(scoresByStudent.collect().toList)
    // List(
    //  (张小军,List(50.0, 60.0, 70.0)),
    //  (李小凤,List(80.0, 80.0, 80.0)),
    //  (王小芳,List(75.0, 85.0, 90.0)),
    //  (王大力,List(70.0, 80.0, 90.0)),
    //  (张大明,List(55.0, 65.0, 60.0)),
    //  (李小华,List(95.0, 92.0, 91.0))
    // )

    // Step 2: reduce each score list to its mean.
    val meanByStudent = scoresByStudent.mapValues(scores => scores.sum / scores.size)
    println(meanByStudent.collect().toList)
    // List(
    //  (张小军,60.0), (李小凤,80.0), (王小芳,83.33333333333333),
    //  (王大力,80.0), (张大明,60.0), (李小华,92.66666666666667)
    // )

    // Average score of each student in class 12.
    val class12Means = tuples
      .filter(_._1 == 12)
      .map(x => (x._2, x._6))
      .groupByKey()
      .mapValues(scores => scores.sum / scores.size)
    println(class12Means.collect().toList)
    // List((张小军,60.0), (李小凤,80.0), (王大力,80.0))

    // Average score of each male student in class 12.
    val class12MaleMeans = tuples
      .filter(x => x._1 == 12 && "男" == x._4)
      .map(x => (x._2, x._6))
      .groupByKey()
      .mapValues(scores => scores.sum / scores.size)
    println(class12MaleMeans.collect().toList)
    // List((张小军,60.0), (王大力,80.0))

    // Average "chinese" score in class 13, keyed by course name.
    val class13ChineseMean = tuples
      .filter(x => x._1 == 13 && "chinese" == x._5)
      .map(x => (x._5, x._6))
      .groupByKey()
      .mapValues(scores => scores.sum / scores.size)
    println(class13ChineseMean.collect().toList)
    // List((chinese,75.0))

    // How many students have a total score above 150?
    val totalsOver150 = tuples
      .map(x => (x._2, x._6))
      .groupByKey()
      .mapValues(_.sum)
      .filter(_._2 > 150)
    println(totalsOver150.collect().toList)
    // List(
    //  (张小军,180.0), (李小凤,240.0), (王小芳,250.0),
    //  (王大力,240.0), (张大明,180.0), (李小华,278.0)
    // )
    println(totalsOver150.count()) // 6

    // How many male students of class 12 have a total score above 150?
    val class12MaleTotalsOver150 = tuples
      .filter(x => x._1 == 12 && "男" == x._4)
      .map(x => (x._2, x._6))
      .groupByKey()
      .mapValues(_.sum)
      .filter(_._2 > 150)
    println(class12MaleTotalsOver150.collect().toList)
    // List((张小军,180.0), (王大力,240.0))
    println(class12MaleTotalsOver150.count()) // 2

    // Release the cached partitions; on failure the caller's sc.stop() frees them instead.
    tuples.unpersist()
  }
}

 

posted @ 2020-06-01 02:20  茗::流  阅读(153)  评论(0)    收藏  举报
如有雷同,纯属参考。如有侵犯你的版权,请联系我。