Spark DataFrame 练习

student.txt

12 张小军 25 男 chinese 50
12 张小军 25 男 math 60
12 张小军 25 男 english 70
12 李小凤 20 女 chinese 80
12 李小凤 20 女 math 80
12 李小凤 20 女 english 80
12 王大力 19 男 chinese 70
12 王大力 19 男 math 80
12 王大力 19 男 english 90
13 张大明 25 男 chinese 55
13 张大明 25 男 math 65
13 张大明 25 男 english 60
13 李小华 20 男 chinese 95
13 李小华 20 男 math 92
13 李小华 20 男 english 91
13 王小芳 19 女 chinese 75
13 王小芳 19 女 math 85
13 王小芳 19 女 english 90

 

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SparkSession}import org.apache.log4j.{Level, Logger}

object App {

  def init(): Unit = {
    //Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    org.apache.log4j.Logger.getLogger("org.apache.spark").setLevel(org.apache.log4j.Level.ERROR)
  }


  def main(args: Array[String]): Unit = {
    init()

    val conf = new SparkConf().setMaster("local[*]").setAppName("spark-rdd-test")
    val sc = new SparkContext(conf)

    // spark-shell
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val df = sqlContext.createDataFrame(List(
      (12, "张小军", 25, "", "chinese", 50.0), (12, "张小军", 25, "", "math", 60.0), (12, "张小军", 25, "", "english", 70.0),
      (12, "李小凤", 20, "", "chinese", 80.0), (12, "李小凤", 20, "", "math", 80.0), (12, "李小凤", 20, "", "english", 80.0),
      (12, "王大力", 19, "", "chinese", 70.0), (12, "王大力", 19, "", "math", 80.0), (12, "王大力", 19, "", "english", 90.0),
      (13, "张大明", 25, "", "chinese", 55.0), (13, "张大明", 25, "", "math", 65.0), (13, "张大明", 25, "", "english", 60.0),
      (13, "李小华", 20, "", "chinese", 95.0), (13, "李小华", 20, "", "math", 92.0), (13, "李小华", 20, "", "english", 91.0),
      (13, "王小芳", 19, "", "chinese", 75.0), (13, "王小芳", 19, "", "math", 85.0), (13, "王小芳", 19, "", "english", 90.0)
    )).toDF("classid", "name", "age", "sex", "course", "score")

    df.createOrReplaceTempView("cte_student")
    df.cache(); //df.persist(StorageLevel.MEMORY_ONLY)

    df.printSchema()
    /*
root
 |-- classid: integer (nullable = false)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = false)
 |-- sex: string (nullable = true)
 |-- course: string (nullable = true)
 |-- score: double (nullable = false)
    * */
df.show() /* +-------+-------+---+---+-------+-----+ |classid| name |age|sex| course|score| +-------+-------+---+---+-------+-----+ | 12| 张小军| 25| 男|chinese| 50.0| | 12| 张小军| 25| 男| math| 60.0| | 12| 张小军| 25| 男|english| 70.0| | 12| 李小凤| 20| 女|chinese| 80.0| | 12| 李小凤| 20| 女| math| 80.0| | 12| 李小凤| 20| 女|english| 80.0| | 12| 王大力| 19| 男|chinese| 70.0| | 12| 王大力| 19| 男| math| 80.0| | 12| 王大力| 19| 男|english| 90.0| | 13| 张大明| 25| 男|chinese| 55.0| | 13| 张大明| 25| 男| math| 65.0| | 13| 张大明| 25| 男|english| 60.0| | 13| 李小华| 20| 男|chinese| 95.0| | 13| 李小华| 20| 男| math| 92.0| | 13| 李小华| 20| 男|english| 91.0| | 13| 王小芳| 19| 女|chinese| 75.0| | 13| 王小芳| 19| 女| math| 85.0| | 13| 王小芳| 19| 女|english| 90.0| +-------+-------+---+----+-------+-----+ * */ //多少人参加考试 val c1 = sqlContext.sql(" select distinct(name) as name from cte_student ") c1.show() /* +-------+ | name| +-------+ | 李小凤| | 张小军| | 李小华| | 王大力| | 王小芳| | 张大明| +-------+ * */ println(c1.count()) // 6 //一共多少个 男生 参加考试 val c2 = sqlContext.sql(" select distinct(name) as name from cte_student where sex='男' ") c2.show() /* +-------+ | name| +-------+ | 张小军| | 李小华| | 王大力| | 张大明| +-------+ * */ println(c2.count()) // 4 //一共有多少个 等于20岁 的人参加考试 sqlContext.sql(" select distinct(name) as name from cte_student where age=20 ").show() /* +-------+ | name| +-------+ | 李小凤| | 李小华| +-------+ * */
//一共有多少个 大于20岁 的人参加考试 sqlContext.sql(" select distinct(name) as name from cte_student where age>20 ").show() /* +-------+ | name| +-------+ | 张小军| | 张大明| +-------+ * */
//12班有多少人参加考试 sqlContext.sql(" select distinct(name) as name from cte_student where classid=12 ").show() /* +-------+ | name| +-------+ | 李小凤| | 张小军| | 王大力| +-------+ * */
//13班有多少个女生参加考试 sqlContext.sql(" select distinct(name) as name from cte_student where classid=13 and sex='女' ").show() /* +-------+ | name| +-------+ | 王小芳| +-------+ * */ //语文科目的平均成绩是多少 sqlContext.sql(" select avg(score) as c from cte_student where course='chinese' ").show() /* +-----------------+ | c| +-----------------+ |70.83333333333333| +-----------------+ * */
//单人平均成绩 sqlContext.sql(" select name, avg(score) as c from cte_student group by name ").show() /* +-------+-----------------+ | name| c| +-------+-----------------+ | 李小凤| 80.0| | 张小军| 60.0| | 李小华|92.66666666666667| | 王大力| 80.0| | 王小芳|83.33333333333333| | 张大明| 60.0| +-------+-----------------+ * */
//12班平均成绩 sqlContext.sql(" select avg(score) as c from cte_student where classid=12 ").show() /* +-----------------+ | c| +-----------------+ |73.33333333333333| +-----------------+ * */
//12班男生的平均成绩 sqlContext.sql( """ |select avg(score) as c |from cte_student |where classid=12 |and sex='男' """.stripMargin).show() /* +----+ | c| +----+ |70.0| +----+ * */
//13班语文的平均成绩 sqlContext.sql( """ |select avg(score) as c |from cte_student |where classid=13 |and course='chinese' """.stripMargin).show() /* +----+ | c| +----+ |75.0| +----+ * */ //总成绩大于150的人有多少 sqlContext.sql( """ |select | name, sum(score) as c |from cte_student |group by name |having sum(score)>150 """.stripMargin).show() /* +-------+-----+ | name| c| +-------+-----+ | 李小凤|240.0| | 张小军|180.0| | 李小华|278.0| | 王大力|240.0| | 王小芳|250.0| | 张大明|180.0| +-------+-----+ * */

//总成绩大于150 12班男生有多少人 sqlContext.sql( """ |select | name, sum(score) as c |from cte_student |where classid=12 |group by name |having sum(score)>150 """.stripMargin).show() /* +-------+-----+ | name| c| +-------+-----+ | 李小凤|240.0| | 张小军|180.0| | 王大力|240.0| +-------+-----+ * */ df.unpersist() } }

 

posted @ 2020-06-01 02:22  茗::流  阅读(254)  评论(0)    收藏  举报
如有雷同,纯属参考。如有侵犯你的版权,请联系我。