Spark DataFrame 练习
student.txt
12 张小军 25 男 chinese 50 12 张小军 25 男 math 60 12 张小军 25 男 english 70 12 李小凤 20 女 chinese 80 12 李小凤 20 女 math 80 12 李小凤 20 女 english 80 12 王大力 19 男 chinese 70 12 王大力 19 男 math 80 12 王大力 19 男 english 90 13 张大明 25 男 chinese 55 13 张大明 25 男 math 65 13 张大明 25 男 english 60 13 李小华 20 男 chinese 95 13 李小华 20 男 math 92 13 李小华 20 男 english 91 13 王小芳 19 女 chinese 75 13 王小芳 19 女 math 85 13 王小芳 19 女 english 90
import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.{SQLContext, SparkSession}import org.apache.log4j.{Level, Logger} object App { def init(): Unit = { //Logger.getLogger("org.apache.spark").setLevel(Level.WARN) org.apache.log4j.Logger.getLogger("org.apache.spark").setLevel(org.apache.log4j.Level.ERROR) } def main(args: Array[String]): Unit = { init() val conf = new SparkConf().setMaster("local[*]").setAppName("spark-rdd-test") val sc = new SparkContext(conf) // spark-shell val sqlContext = new SQLContext(sc) import sqlContext.implicits._ val df = sqlContext.createDataFrame(List( (12, "张小军", 25, "男", "chinese", 50.0), (12, "张小军", 25, "男", "math", 60.0), (12, "张小军", 25, "男", "english", 70.0), (12, "李小凤", 20, "女", "chinese", 80.0), (12, "李小凤", 20, "女", "math", 80.0), (12, "李小凤", 20, "女", "english", 80.0), (12, "王大力", 19, "男", "chinese", 70.0), (12, "王大力", 19, "男", "math", 80.0), (12, "王大力", 19, "男", "english", 90.0), (13, "张大明", 25, "男", "chinese", 55.0), (13, "张大明", 25, "男", "math", 65.0), (13, "张大明", 25, "男", "english", 60.0), (13, "李小华", 20, "男", "chinese", 95.0), (13, "李小华", 20, "男", "math", 92.0), (13, "李小华", 20, "男", "english", 91.0), (13, "王小芳", 19, "女", "chinese", 75.0), (13, "王小芳", 19, "女", "math", 85.0), (13, "王小芳", 19, "女", "english", 90.0) )).toDF("classid", "name", "age", "sex", "course", "score") df.createOrReplaceTempView("cte_student") df.cache(); //df.persist(StorageLevel.MEMORY_ONLY) df.printSchema() /* root |-- classid: integer (nullable = false) |-- name: string (nullable = true) |-- age: integer (nullable = false) |-- sex: string (nullable = true) |-- course: string (nullable = true) |-- score: double (nullable = false) * */
df.show() /* +-------+-------+---+---+-------+-----+ |classid| name |age|sex| course|score| +-------+-------+---+---+-------+-----+ | 12| 张小军| 25| 男|chinese| 50.0| | 12| 张小军| 25| 男| math| 60.0| | 12| 张小军| 25| 男|english| 70.0| | 12| 李小凤| 20| 女|chinese| 80.0| | 12| 李小凤| 20| 女| math| 80.0| | 12| 李小凤| 20| 女|english| 80.0| | 12| 王大力| 19| 男|chinese| 70.0| | 12| 王大力| 19| 男| math| 80.0| | 12| 王大力| 19| 男|english| 90.0| | 13| 张大明| 25| 男|chinese| 55.0| | 13| 张大明| 25| 男| math| 65.0| | 13| 张大明| 25| 男|english| 60.0| | 13| 李小华| 20| 男|chinese| 95.0| | 13| 李小华| 20| 男| math| 92.0| | 13| 李小华| 20| 男|english| 91.0| | 13| 王小芳| 19| 女|chinese| 75.0| | 13| 王小芳| 19| 女| math| 85.0| | 13| 王小芳| 19| 女|english| 90.0| +-------+-------+---+----+-------+-----+ * */ //多少人参加考试 val c1 = sqlContext.sql(" select distinct(name) as name from cte_student ") c1.show() /* +-------+ | name| +-------+ | 李小凤| | 张小军| | 李小华| | 王大力| | 王小芳| | 张大明| +-------+ * */ println(c1.count()) // 6 //一共多少个 男生 参加考试 val c2 = sqlContext.sql(" select distinct(name) as name from cte_student where sex='男' ") c2.show() /* +-------+ | name| +-------+ | 张小军| | 李小华| | 王大力| | 张大明| +-------+ * */ println(c2.count()) // 4 //一共有多少个 等于20岁 的人参加考试 sqlContext.sql(" select distinct(name) as name from cte_student where age=20 ").show() /* +-------+ | name| +-------+ | 李小凤| | 李小华| +-------+ * */
//一共有多少个 大于20岁 的人参加考试 sqlContext.sql(" select distinct(name) as name from cte_student where age>20 ").show() /* +-------+ | name| +-------+ | 张小军| | 张大明| +-------+ * */
//12班有多少人参加考试 sqlContext.sql(" select distinct(name) as name from cte_student where classid=12 ").show() /* +-------+ | name| +-------+ | 李小凤| | 张小军| | 王大力| +-------+ * */
//13班有多少个女生参加考试 sqlContext.sql(" select distinct(name) as name from cte_student where classid=13 and sex='女' ").show() /* +-------+ | name| +-------+ | 王小芳| +-------+ * */ //语文科目的平均成绩是多少 sqlContext.sql(" select avg(score) as c from cte_student where course='chinese' ").show() /* +-----------------+ | c| +-----------------+ |70.83333333333333| +-----------------+ * */
//单人平均成绩 sqlContext.sql(" select name, avg(score) as c from cte_student group by name ").show() /* +-------+-----------------+ | name| c| +-------+-----------------+ | 李小凤| 80.0| | 张小军| 60.0| | 李小华|92.66666666666667| | 王大力| 80.0| | 王小芳|83.33333333333333| | 张大明| 60.0| +-------+-----------------+ * */
//12班平均成绩 sqlContext.sql(" select avg(score) as c from cte_student where classid=12 ").show() /* +-----------------+ | c| +-----------------+ |73.33333333333333| +-----------------+ * */
//12班男生的平均成绩 sqlContext.sql( """ |select avg(score) as c |from cte_student |where classid=12 |and sex='男' """.stripMargin).show() /* +----+ | c| +----+ |70.0| +----+ * */
//13班语文的平均成绩 sqlContext.sql( """ |select avg(score) as c |from cte_student |where classid=13 |and course='chinese' """.stripMargin).show() /* +----+ | c| +----+ |75.0| +----+ * */ //总成绩大于150的人有多少 sqlContext.sql( """ |select | name, sum(score) as c |from cte_student |group by name |having sum(score)>150 """.stripMargin).show() /* +-------+-----+ | name| c| +-------+-----+ | 李小凤|240.0| | 张小军|180.0| | 李小华|278.0| | 王大力|240.0| | 王小芳|250.0| | 张大明|180.0| +-------+-----+ * */
//总成绩大于150 12班男生有多少人 sqlContext.sql( """ |select | name, sum(score) as c |from cte_student |where classid=12 |group by name |having sum(score)>150 """.stripMargin).show() /* +-------+-----+ | name| c| +-------+-----+ | 李小凤|240.0| | 张小军|180.0| | 王大力|240.0| +-------+-----+ * */ df.unpersist() } }

浙公网安备 33010602011771号