# 8. SparkSQL comprehensive assignment
# --- Setup: build an RDD from the CSV file (name,course,grade) and cache it ---
url = "file:///home/hadoop/sc.txt"
rdd = sc.textFile(url).map(lambda x: x.split(","))
scm = rdd.map(lambda a: [a[0], a[1], int(a[2])])
scm.cache()
scm.take(3)

# Convert the RDD to a DataFrame via Row objects (grade cast to int).
from pyspark.sql import Row
df = spark.createDataFrame(
    rdd.map(lambda x: Row(name=x[0], course=x[1], grade=int(x[2]))))

# Inspect the DataFrame.
df.show()
df.printSchema()

# Register as a temporary view so it can be queried with SQL.
df.createOrReplaceTempView("sc")
spark.sql("select * from sc").show()

from pyspark.sql.functions import *

# Q1. How many distinct students are there?
scm.map(lambda a: a[0]).distinct().count()
df.select(df.name).distinct().count()
spark.sql("select count(distinct name) from sc").show()

# Q2. Which courses are offered?
scm.map(lambda a: a[1]).distinct().collect()
df.select(df.course).distinct().show()
spark.sql("select distinct course from sc").show()

# Q3. How many courses did each student take?
name = scm.map(lambda a: (a[0], (a[1], a[2])))  # (student, (course, grade))
name.countByKey()
df.groupBy(df.name).count().show()
spark.sql("select name, count(course) from sc group by name").show()

# Q4. How many students took each course?
name.values().countByKey()
df.groupBy(df.course).count().show()
spark.sql("select course, count(name) from sc group by course").show()

# Q5. Per course, number of students scoring above 95.
name.values().filter(lambda a: a[1] > 95).countByKey()
df.filter(df.grade > 95).groupBy(df.course).count().show()
spark.sql("select course,count(*) from sc where grade>95 group by course").show()

# Q6. How many scores of exactly 100 in course 'Python'?
scm.filter(lambda a: a[1] == 'Python').filter(lambda a: a[2] == 100).count()
df.filter((df.course == 'Python') & (df.grade == 100)).count()
spark.sql("select count(*) from sc where course='Python' and grade=100").show()

# Q7 (old wording, kept for reference).
# How many courses did Tom take, and what grade in each?
name.lookup('Tom')
len(name.lookup('Tom'))
# Filter BEFORE select: the original selected (course, grade) first and then
# filtered on df.name, a column no longer present in the projected DataFrame.
df.filter(df.name == 'Tom').select(df.course, df.grade).count()
df.filter(df.name == 'Tom').select(df.course, df.grade).show()
spark.sql("select count(*) from sc where name='Tom'").show()
spark.sql("select course, grade from sc where name='Tom'").show()

# Q7. Which courses did Tom fail (grade < 60)? (answer code follows)
# Q7. Which courses did Tom fail (grade < 60)?
scm.filter(lambda a: a[0] == 'Tom').filter(lambda a: a[2] < 60).map(lambda a: a[1]).collect()
# Filter before select: the original filtered on df.name/df.grade after
# projecting only df.course, columns already dropped from the DataFrame.
df.filter((df.name == 'Tom') & (df.grade < 60)).select(df.course).show()
spark.sql("select course from sc where name='Tom' and grade<60").show()

# Q8. Tom's grades sorted by score.
# NOTE(review): the RDD answer sorts descending while the DataFrame/SQL
# answers sort ascending — confirm which order the assignment expects.
name.filter(lambda a: a[0] == 'Tom').values().sortBy(lambda a: a[1], False).map(lambda a: a[1]).collect()
# Filter first, then project, then sort (the original sorted, then filtered
# on df.name — a column dropped by the preceding select).
df.filter(df.name == 'Tom').select(df.grade).orderBy(df.grade).show()
spark.sql("select grade from sc where name='Tom' order by grade asc").show()

# Q9. Which courses did Tom take?
scm.filter(lambda a: a[0] == 'Tom').map(lambda a: a[1]).collect()
df.filter(df.name == 'Tom').select(df.course).show()
spark.sql('select course from sc where name = "Tom"').show()

# Q10. Tom's average grade.
import numpy as np
np.mean(scm.filter(lambda a: a[0] == 'Tom').map(lambda a: a[2]).collect())
df.filter(df.name == 'Tom').agg({'grade': 'mean'}).show()
spark.sql("select avg(grade) from sc where name='Tom'").show()

# Q11. Number of failing grades (<60) in 'OperatingSystem'.
scm.filter(lambda a: a[1] == 'OperatingSystem' and a[2] < 60).count()
df.filter(df['course'] == 'OperatingSystem').filter(df['grade'] < 60).count()
spark.sql('select count(grade) from sc where course = "OperatingSystem" and grade < 60').show()

# Q12. Average grade in 'OperatingSystem'.
np.mean(scm.filter(lambda a: a[1] == 'OperatingSystem').map(lambda a: (a[1], a[2])).values().collect())
df.filter(df['course'] == 'OperatingSystem').agg({'grade': 'avg'}).show()
spark.sql('select AVG(grade) from sc where course = "OperatingSystem"').show()

# Q13. Number of grades above 90 in 'OperatingSystem'.
scm.filter(lambda a: a[1] == 'OperatingSystem' and a[2] > 90).count()
df.filter(df['course'] == 'OperatingSystem').filter(df['grade'] > 90).count()
# count(*) replaces the original count("OperatingSystem > 90"), which counted
# a non-null string literal per row — same result, much clearer intent.
spark.sql('select count(*) from sc where course = "OperatingSystem" and grade > 90').show()

# Q14. Top 3 students in 'OperatingSystem' by grade.
scm.filter(lambda a: a[1] == 'OperatingSystem').sortBy(lambda a: a[2], False).map(lambda a: a[0]).take(3)
df.filter(df['course'] == 'OperatingSystem').sort(-df['grade']).select(df.name).limit(3).show()
spark.sql('select name from sc where course = "OperatingSystem" order by grade desc limit 3').show()

# Q15. Rescale each grade: 80% of the score plus 20 attendance points.
scm.map(lambda a: ((a[0], a[1]), a[2])).take(5)
scm.map(lambda a: ((a[0], a[1]), a[2])).mapValues(lambda v: v * 0.8 + 20).take(5)
df.select(df.name, df.course, df.grade).show()
df.select(df.name, df.course, df.grade * 0.8 + 20).show()
spark.sql("select name, course, grade from sc").show()
spark.sql("select name, course, grade*0.8+20 from sc").show()

# Q16. Average grade per course (RDD version: sum/count pairs reduced by key).
rdd.map(lambda a: (a[1], (int(a[2]), 1))) \
   .reduceByKey(lambda a, b: [a[0] + b[0], a[1] + b[1]]) \
   .map(lambda a: [a[0], a[1][0] / a[1][1]]) \
   .collect()
df.groupBy(df.course).agg({'grade': 'avg'}).show()
spark.sql('select course, AVG(grade) from sc group by course').show()

# Q17. How many students took exactly 7 courses?
# NOTE(review): the source is truncated mid-statement here
# ("rdd.map(lambda a:(a[0],") — the remainder of this answer is missing
# from the file and has not been reconstructed.
