8. SparkSQL Comprehensive Assignment
## Import the packages needed for this lab
from pyspark.sql import Row
from pyspark.sql.types import IntegerType, StringType, StructField, StructType
import pyspark.sql.functions as f
# Create the lines RDD
lines=sc.textFile('file:///home/ywm/sc.txt')
lines.take(1)
# Create the df_ncs DataFrame
df_ncs=spark.createDataFrame(lines.map(lambda l:l.split(',')).map(lambda l:Row(name=l[0],course=l[1],score=int(l[2]))))
df_ncs.printSchema()
df_ncs.first()
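# The types imported above also allow an explicit schema instead of Row
# inference; a sketch (df_ncs2 is a hypothetical name, equivalent to df_ncs):
schema = StructType([StructField('name', StringType(), True),
                     StructField('course', StringType(), True),
                     StructField('score', IntegerType(), True)])
df_ncs2 = spark.createDataFrame(lines.map(lambda l:l.split(',')).map(lambda l:(l[0],l[1],int(l[2]))), schema)
df_ncs2.printSchema()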
# 生成sql_ncs 临时视图(可以注册临时表之类的也可以用spark sql操作)
df_ncs.createOrReplaceTempView('sql_ncs')
df_ncs.first()
 
# How many students are there in total?
lines.map(lambda l:l.split(',')).map(lambda l:l[0]).distinct().count()
df_ncs.select('name').distinct().count()
spark.sql("select count(distinct name) from sql_ncs").show()
 
# How many courses are offered in total?
lines.map(lambda l:l.split(',')).map(lambda l:l[1]).distinct().count()
df_ncs.select('course').distinct().count()
spark.sql("select count(distinct course) from sql_ncs").show()
 
# How many courses does each student take?
lines.map(lambda l:l.split(',')).map(lambda l:(l[0],l[1])).countByKey()
df_ncs.groupBy('name').count().head(2)
spark.sql("select name,count(1) from sql_ncs group by name").show()
 
# How many students take each course?
lines.map(lambda l:l.split(',')).map(lambda l:(l[1],l[0])).countByKey()
df_ncs.groupBy('course').count().head(3)
spark.sql("select course, count(1) from sql_ncs group by course").show()
 
# Number of students scoring above 95 in each course
lines.map(lambda l:l.split(',')).map(lambda l:(l[1],l[2])).filter(lambda l:(int(l[1])>95)).countByKey()
df_ncs.filter(df_ncs['score'] > 95).groupBy('course').count().head(2)
spark.sql("select course, count(1) from sql_ncs where score>95 group by course").show()
 
# How many 100s are there in the 'Python' course?
lines.map(lambda l:l.split(',')).filter(lambda l:(l[1]=='Python') and (int(l[2])==100)).count()
df_ncs.where("course=='Python' and score==100").count()
spark.sql("select course,score from sql_ncs where score=100 and course='Python'").count()
 
# How many courses does Tom take, and what is his score in each?
lines.map(lambda l:l.split(',')).filter(lambda l:(l[0]=='Tom')).count()
lines.map(lambda l:l.split(',')).filter(lambda l:(l[0]=='Tom')).map(lambda l:(l[1],l[2])).foreach(print)
df_ncs.filter(df_ncs['name']=='Tom').count()
df_ncs.filter(df_ncs['name']=='Tom').select('course','score').show()
spark.sql("select count(distinct course) from sql_ncs where name=='Tom'").show()
spark.sql("select course,score from sql_ncs where name=='Tom'").show()
 
# Tom's scores sorted in descending order. (Note: foreach(print) is unreliable for ordered output, since it runs on the executors and prints in arbitrary order; collect() preserves the sort.)
lines.map(lambda l:l.split(',')).filter(lambda l:(l[0]=='Tom')).map(lambda l:(l[1],l[2])).sortBy(lambda l:int(l[1]),False).collect()
df_ncs[df_ncs.name=='Tom'].select('course','score').sort(df_ncs.score.desc()).show()
spark.sql("select course,score from sql_ncs where name=='Tom' order by score desc").show()
 
# Which courses does Tom take?
lines.map(lambda l:l.split(',')).filter(lambda l:(l[0]=='Tom')).map(lambda l:l[1]).foreach(print)
df_ncs.filter(df_ncs['name']=='Tom').select('course').show()
spark.sql("select course from sql_ncs where name=='Tom'").show()
 
# Tom's average score.
lines.map(lambda l:l.split(',')).filter(lambda l:(l[0]=='Tom')).map(lambda l:int(l[2])).mean()
df_ncs.filter(df_ncs['name']=='Tom').agg({'score':'mean'}).show()
spark.sql("select avg(score) from sql_ncs where name=='Tom'").show()
 
# Number of failing (score below 60) students in 'OperatingSystem'
lines.map(lambda l:l.split(',')).filter(lambda l:(l[1]=='OperatingSystem') and (int(l[2])<60)).count()
df_ncs.where("score<60 and course=='OperatingSystem'").count()
spark.sql("select count(name) from sql_ncs where score<60 and course=='OperatingSystem'").show()
 
# Average score in 'OperatingSystem'
lines.map(lambda l:l.split(',')).filter(lambda l:(l[1]=='OperatingSystem')).map(lambda l:int(l[2])).mean()
df_ncs.filter(df_ncs['course']=='OperatingSystem').agg({'score':'mean'}).show()
spark.sql("select avg(score) from sql_ncs where course=='OperatingSystem'").show()
 
# Number of students scoring above 90 in 'OperatingSystem'
lines.map(lambda l:l.split(',')).filter(lambda l:(l[1]=='OperatingSystem') and (int(l[2])>90)).count()
df_ncs.where("course=='OperatingSystem' and score>90").count()
spark.sql("select count(name) from sql_ncs where score>90 and course=='OperatingSystem'").show()
 
# Top 3 students in 'OperatingSystem'
lines.map(lambda l:l.split(',')).filter(lambda l:(l[1]=='OperatingSystem')).sortBy(lambda l:int(l[2]),False).take(3)
df_ncs[df_ncs.course=='OperatingSystem'].sort(df_ncs.score.desc()).head(3)
spark.sql("select name from sql_ncs where course=='OperatingSystem' order by score desc limit 3").show()
 
# Add 20 regular-grade points to each score.
lines.map(lambda l:l.split(',')).map(lambda l:(l[0],l[1],(int(l[2])+20))).take(2)
df_ncs.select(df_ncs['score']+20).first()
spark.sql("select name,course,score+20 from sql_ncs").show()
 
# Average score per course (combineByKey builds a (sum, count) pair per partition and merges them across partitions; check lines.getNumPartitions())
lines.map(lambda l:l.split(',')).map(lambda l:(l[1],int(l[2]))).combineByKey(lambda v:(v,1),lambda c,v:(c[0]+v,c[1]+1),lambda c1,c2:(c1[0]+c2[0],c1[1]+c2[1])).mapValues(lambda l:l[0]/l[1]).take(3)
df_ncs.groupBy('course').avg('score').show()
spark.sql("select course,avg(score) from sql_ncs group by course").show()
 
# How many students take exactly 7 courses?
lines.map(lambda l:l.split(',')).map(lambda l:(l[0],1)).reduceByKey(lambda a,b:a+b).filter(lambda l:l[1]==7).count()
df_ncs.groupby('name').agg(f.count('name').alias('cnt')).where('cnt==7').count()
spark.sql("select 1 from sql_ncs group by name having count(distinct course)=7").count()
 
# Number of students with scores above 95 per course
lines.map(lambda l:l.split(',')).map(lambda l:(l[1],int(l[2]))).filter(lambda l:l[1]>95).countByKey()
df_ncs.where("score>95").groupby('course').count().show()
spark.sql("select course,count(distinct name) from sql_ncs where score>95 group by course").show()
 
# Per course: enrollment (the answers below can disagree: countByKey counts rows, while the SQL counts distinct names), average score, failing count, and pass rate
lines.map(lambda l:l.split(',')).map(lambda l:(l[1],int(l[2]))).countByKey()
df_ncs.groupby('course').count().show()
spark.sql("select course,count(distinct name) from sql_ncs group by course").show()
 
lines.map(lambda l:l.split(',')).map(lambda l:(l[1],int(l[2]))).combineByKey(lambda v:(v,1),lambda c,v:(c[0]+v,c[1]+1),lambda c1,c2:(c1[0]+c2[0],c1[1]+c2[1])).mapValues(lambda l:l[0]/l[1]).foreach(print)
df_ncs.groupby('course').agg({'score':'mean'}).show()
spark.sql("select course,avg(score) from sql_ncs group by course").show()
 
lines.map(lambda l:l.split(',')).map(lambda l:(l[1],int(l[2]))).filter(lambda l:l[1]<60).countByKey()
df_ncs.where("score<60").groupby('course').count().show()
spark.sql("select course,count(1) from sql_ncs where score<60 group by course").show()
 
#每门课通过率
fz = lines.map(lambda l:l.split(',')).filter(lambda l:int(l[2])>60).map(lambda l:(l[1],int(l[2]))).countByKey()
fm = lines.map(lambda l:l.split(',')).map(lambda l:(l[1],int(l[2]))).countByKey()
for i in fm:
	print(i,round(fz[i]/fm[i],2))
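 
# For symmetry with the tasks above, a DataFrame sketch of the same pass-rate
# computation (again assuming a score of 60 counts as a pass):
df_ncs.groupBy('course').agg(
    (f.sum(f.when(df_ncs['score']>=60,1).otherwise(0))/f.count('*')).alias('pass_rate')
).show()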
 
 
# How many students fall in each band: excellent, good, pass, and fail?
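# A sketch, assuming the usual bands (excellent>=90, good 80-89, pass 60-79,
# fail<60; the cutoffs are an assumption, not given above):
df_ncs.select(f.when(df_ncs['score']>=90,'excellent')
               .when(df_ncs['score']>=80,'good')
               .when(df_ncs['score']>=60,'pass')
               .otherwise('fail').alias('grade')).groupBy('grade').count().show()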
 
# Students taking both DataStructure and DataBase
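# A sketch using intersect on the two name sets:
df_ncs.where("course=='DataStructure'").select('name').intersect(
    df_ncs.where("course=='DataBase'").select('name')).show()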
# Students taking DataStructure but not DataBase
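# A sketch using subtract (set difference) on the name sets:
df_ncs.where("course=='DataStructure'").select('name').subtract(
    df_ncs.where("course=='DataBase'").select('name')).show()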
# Students taking fewer than 3 courses
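# A sketch with a HAVING clause:
spark.sql("select name from sql_ncs group by name having count(distinct course)<3").show()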
# Students taking 6 or more courses
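# Same pattern with the opposite bound:
spark.sql("select name from sql_ncs group by name having count(distinct course)>=6").show()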
# Names and average scores of students whose average is at least 60
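# A sketch; avg_score is an alias introduced here:
spark.sql("select name, avg(score) as avg_score from sql_ncs group by name having avg(score)>=60").show()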
# The 10 students with the highest average scores
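# A sketch ordering by the per-student average:
spark.sql("select name, avg(score) as avg_score from sql_ncs group by name order by avg_score desc limit 10").show()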
 