8. SparkSQL综合作业

综合练习:学生课程分数
点击查看代码
###  网盘下载sc.txt文件,分别用RDD操作、DataFrame操作和spark.sql执行SQL语句实现以下数据分析

## 导入本次实验所需要用到的包
from pyspark.sql import Row
from pyspark.sql.types import IntegerType, StringType, StructField, StructType
from pyspark.sql import functions as f
lines=sc.textFile('file:///home/ywm/sc.txt')
lines.take(1)
df_ncs=spark.createDataFrame(lines.map(lambda l:l.split(',')).map(lambda l:Row(name=l[0],course=l[1],score=int(l[2]))))
df_ncs.printSchema()
df_ncs.first()
df_ncs.createOrReplaceTempView('sql_ncs')
spark.sql("select * from sql_ncs limit 3").show()

#1 总共有多少学生?
lines.map(lambda l:l.split(',')).map(lambda l:l[0]).distinct().count()
df_ncs.select('name').distinct().count()
spark.sql("select count(distinct name) from sql_ncs").show()

#2 总共开设了多少门课程?
lines.map(lambda l:l.split(',')).map(lambda l:l[1]).distinct().count()
df_ncs.select('course').distinct().count()
spark.sql("select count(distinct course) from sql_ncs").show()

#3 每个学生选修了多少门课?
lines.map(lambda l:l.split(',')).map(lambda l:(l[0],l[1])).countByKey()
df_ncs.groupBy('name').count().head(2)
spark.sql("select name,count(1) from sql_ncs group by name").show()

#4 每门课程有多少个学生选?
lines.map(lambda l:l.split(',')).map(lambda l:(l[1],l[0])).countByKey()
df_ncs.groupBy('course').count().head(3)
spark.sql("select course, count(1) from sql_ncs group by course").show()

#5 每门课程>95分的学生人数
lines.map(lambda l:l.split(',')).map(lambda l:(l[1],l[2])).filter(lambda l:(int(l[1])>95)).countByKey()
df_ncs.filter(df_ncs['score'] > 95).groupBy('course').count().head(2)
spark.sql("select course, count(1) from sql_ncs where score>95 group by course").show()

#6 课程'Python'有多少个100分?
lines.map(lambda l:l.split(',')).filter(lambda l:((l[1]=='Python') & (int(l[2])==100))).count()
df_ncs.where("course=='Python' and score==100").count()
spark.sql("select course,score from sql_ncs where score=100 and course='Python'").count()

#7 Tom选修了几门课?每门课多少分?
lines.map(lambda l:l.split(',')).filter(lambda l:(l[0]=='Tom')).count()
lines.map(lambda l:l.split(',')).filter(lambda l:(l[0]=='Tom')).map(lambda l:(l[1],l[2])).foreach(print)
df_ncs.filter(df_ncs['name']=='Tom').count()
df_ncs.filter(df_ncs['name']=='Tom').select('course','score').show()
spark.sql("select count(distinct course) from sql_ncs where name=='Tom'").show()
spark.sql("select course,score from sql_ncs where name=='Tom'").show()

#8 Tom的成绩按分数大小排序。(注意,此处排序好像不能用foreach(print)打印输出,胡乱打印)
lines.map(lambda l:l.split(',')).filter(lambda l:(l[0]=='Tom')).map(lambda l:(l[1],l[2])).sortBy(lambda l:int(l[1]),False).collect()
df_ncs[df_ncs.name=='Tom'].select('course','score').sort(df_ncs.score.desc()).show()
spark.sql("select course,score from sql_ncs where name=='Tom' order by score desc").show()

#9 Tom选修了哪几门课?
lines.map(lambda l:l.split(',')).filter(lambda l:(l[0]=='Tom')).map(lambda l:l[1]).foreach(print)
df_ncs.filter(df_ncs['name']=='Tom').select('course').show()
spark.sql("select course from sql_ncs where name=='Tom'").show()

#10 Tom的平均分。
lines.map(lambda l:l.split(',')).filter(lambda l:(l[0]=='Tom')).map(lambda l:int(l[2])).mean()
df_ncs.filter(df_ncs['name']=='Tom').agg({'score':'mean'}).show()
spark.sql("select avg(score) from sql_ncs where name=='Tom'").show()

#11 'OperatingSystem'不及格人数
lines.map(lambda l:l.split(',')).filter(lambda l:((l[1]=='OperatingSystem') & (int(l[2])<60))).count()
df_ncs.where("score<60 and course=='OperatingSystem'").count()
spark.sql("select count(name) from sql_ncs where score<60 and course=='OperatingSystem'").show()

#12 'OperatingSystem'平均分
lines.map(lambda l:l.split(',')).filter(lambda l:(l[1]=='OperatingSystem')).map(lambda l:int(l[2])).mean()
df_ncs.filter(df_ncs['course']=='OperatingSystem').agg({'score':'mean'}).show()
spark.sql("select avg(score) from sql_ncs where course=='OperatingSystem'").show()

#13 'OperatingSystem'90分以上人数
lines.map(lambda l:l.split(',')).filter(lambda l:((l[1]=='OperatingSystem')&(int(l[2])>90))).count()
df_ncs.where("course=='OperatingSystem' and  score>90").count()
spark.sql("select count(name) from sql_ncs where score>90 and course=='OperatingSystem'").show()

#14 'OperatingSystem'前3名
lines.map(lambda l:l.split(',')).filter(lambda l:(l[1]=='OperatingSystem')).sortBy(lambda l:int(l[2]),False).take(3)
df_ncs[df_ncs.course=='OperatingSystem'].sort(df_ncs.score.desc()).head(3)
spark.sql("select name from sql_ncs where course=='OperatingSystem' order by score desc limit 3").show()

#15 每个分数按比例+20平时分。
lines.map(lambda l:l.split(',')).map(lambda l:(l[0],l[1],(int(l[2])*8/10+20))).take(2)
df_ncs.select('name','course',(df_ncs['score']*8/10+20).alias('score')).first()
spark.sql("select name,course,score*8/10+20 as score from sql_ncs").show()

#16 求每门课的平均分( lines.getNumPartitions() )
lines.map(lambda l:l.split(',')).map(lambda l:(l[1],int(l[2]))).combineByKey(lambda v:(v,1),lambda c,v:(c[0]+v,c[1]+1),lambda c1,c2:(c1[0]+c2[0],c1[1]+c2[1])).mapValues(lambda l:l[0]/l[1]).take(3)
df_ncs.groupBy('course').avg('score').show()
spark.sql("select course,avg(score) from sql_ncs group by course").show()

#17 选修了7门课的有多少个学生?
lines.map(lambda l:l.split(',')).map(lambda l:(l[0],1)).reduceByKey(lambda a,b:a+b).filter(lambda l:l[1]==7).count()
df_ncs.groupby('name').agg({'name':'count'}).where("count(name)==7").count()
spark.sql("select 1 from sql_ncs group by name having count(distinct course)=7").count()

#18 每门课大于95分的学生数
lines.map(lambda l:l.split(',')).map(lambda l:(l[1],int(l[2]))).filter(lambda l:l[1]>95).countByKey()
df_ncs.where("score>95").groupby('course').count().show()
spark.sql("select course,count(distinct name) from sql_ncs where score>95 group by course").show()

#19 每门课的选修人数、平均分、不及格人数、通过率
lines.map(lambda l:l.split(',')).map(lambda l:(l[1],int(l[2]))).combineByKey(lambda v:(v,1),lambda a,b:(a[0]+b,a[1]+1),lambda p1,p2:(p1[0]+p2[0],p1[1]+p2[1])).map(lambda l:(l[0],(l[1][1],round(l[1][0]/l[1][1],2)))).leftOuterJoin(lines.map(lambda l:l.split(',')).map(lambda l:(l[1],int(l[2]))).filter(lambda l:l[1]<60).map(lambda l:(l[0],1)).reduceByKey(lambda a,b:a+b)).mapValues(lambda l:(l[0][0],l[0][1],l[1],round((l[0][0]-l[1])/l[0][0],2))).foreach(print)

df1=df_ncs.groupBy('course').agg(f.count('name').alias('stus'),f.bround(f.avg('score'),2).alias('avg'))
df2=df_ncs.where("score<60").groupby('course').agg(f.count('course').alias('stus_fail'))
df1.join(df2,['course'],'left').select('course','stus','avg','stus_fail',f.bround(((df1['stus']-df2['stus_fail'])/df1['stus']),2).alias('pass')).show()

spark.sql("select course,count(1) as stus, round(avg(score),2) as avg from sql_ncs group by course").createOrReplaceTempView('t1')
spark.sql("select course,count(1) as fail_stus from sql_ncs where score<60 group by course").createOrReplaceTempView('t2')
spark.sql("select t1.course,t1.stus,t1.avg,t2.fail_stus,round((t1.stus-t2.fail_stus)/t1.stus,2) as pass from t1 left join t2 on t1.course=t2.course").show()

#20 优秀、良好、通过和不合格各有多少人?90,80,60(人数)
def level(score):
	if score>=90:    return 'a'
	if score>=80:    return 'b'
	if score>=60:    return 'c'
	else:   return 'd'
lines.map(lambda l:l.split(',')).map(lambda l:(level(int(l[2])),l[0])).countByKey()
df_ncs.groupBy(f.when(df_ncs.score>=90,'a').when(df_ncs.score>=80,'b').when(df_ncs.score>=60,'c').otherwise('d').alias('level')).agg(f.count('course').alias('stus')).show()
spark.sql("select case when score>=90 then 'a' when score>=80 then 'b' when score >=60 then 'c' else 'd' end as level, count(1) as stus from sql_ncs group by level").show()

#21 同时选修了DataStructure和 DataBase 的学生
lines.map(lambda l:l.split(',')).map(lambda l:(l[0],l[1])).filter(lambda l:(l[1]=='DataStructure' or l[1]=='DataBase')).distinct().map(lambda l:(l[0],1)).reduceByKey(lambda a,b:a+b).filter(lambda l:l[1]==2).take(3)

df_ncs.select('name','course').where(df_ncs['course']=='DataBase').join(df_ncs.select('name','course').where(df_ncs.course=='DataStructure').withColumnRenamed('course','course1'),['name'],'inner').distinct().take(3)

spark.sql("select name,course from sql_ncs where course='DataStructure'").createOrReplaceTempView('temp1')
spark.sql("select name,course from sql_ncs where course='DataBase'").createOrReplaceTempView('temp2')
spark.sql("select distinct(temp1.name) from temp1 left join temp2 on temp1.name=temp2.name where temp2.name is not null").take(3)

#22 选修了DataStructure 但没有选修 DataBase 的学生
lines.map(lambda l:l.split(',')).map(lambda l:(l[0],l[1])).groupByKey().mapValues(list).filter(lambda l:('DataBase' not in l[1] and 'DataStructure' in l[1])).take(3)

df_ncs.select('name','course').where(df_ncs['course']=='DataBase').join(df_ncs.select('name','course').where(df_ncs.course=='DataStructure').withColumnRenamed('course','course1'),['name'],'right').filter(df_ncs['course'].isNull())

df1=df_ncs.select('name','course').where(df_ncs.course=='DataStructure')
df2=df_ncs.select('name','course').where(df_ncs.course=='DataBase').withColumnRenamed('course','course1')
df1.join(df2,'name','left').where(df2.course1.isNull()).take(3)

spark.sql("select name,course from sql_ncs where course='DataStructure'").createOrReplaceTempView('temp1')
spark.sql("select name,course from sql_ncs where course='DataBase'").createOrReplaceTempView('temp2')
spark.sql("select distinct(temp1.name) from temp1 left join temp2 on temp1.name=temp2.name where temp2.name is null").take(2)

#23 选修课程数少于3门的同学
lines.map(lambda l:l.split(',')).map(lambda l:(l[0],l[1])).distinct().map(lambda l:(l[0],1)).reduceByKey(lambda a,b:a+b).filter(lambda l:l[1]<3).take(3)
df_ncs.select('name','course').groupby('name').agg(f.count('course')).where("count(course)<3").take(3)
spark.sql('select name,count(1) from sql_ncs group by name having count(course)<3').take(3)

#24 选修6门及以上课程数的同学
lines.map(lambda l:l.split(',')).map(lambda l:(l[0],l[1])).distinct().map(lambda l:(l[0],1)).reduceByKey(lambda a,b:a+b).filter(lambda l:l[1]>=6).take(3)
df_ncs.groupby('name').agg(f.count('course')).where("count(course)>=6").take(3)
spark.sql('select name,count(1) from sql_ncs group by name having count(course)>=6').take(3)

#25 查询平均成绩大于等于60分的姓名和平均成绩
lines.map(lambda l:l.split(',')).map(lambda l:(l[0],int(l[2]))).combineByKey(lambda l:(l,1),lambda c,v:(c[0]+v,c[1]+1),lambda a,b:(a[0]+b[0],a[1]+b[1])).mapValues(lambda l:round(l[0]/l[1],2)).filter(lambda l:l[1]>=60).take(3)
df_ncs.groupBy('name').agg(f.avg('score')).where("avg(score)>=60").take(3)
spark.sql('select name,avg(score) from sql_ncs group by name having avg(score)>=60').take(3)

#26 找出平均分最高的10位同学
lines.map(lambda l:l.split(',')).map(lambda l:(l[0],int(l[2]))).combineByKey(lambda l:(l,1),lambda c,v:(c[0]+v,c[1]+1),lambda a,b:(a[0]+b[0],a[1]+b[1])).mapValues(lambda l:round(l[0]/l[1],2)).sortBy(lambda l:l[1],False).take(10)
df_ncs.groupBy('name').agg(f.avg('score')).sort('avg(score)',ascending=False).take(10)
spark.sql('select name,avg(score) from sql_ncs group by name order by avg(score) desc').take(10)


  • 网盘下载sc.txt文件,分别用RDD操作、DataFrame操作和spark.sql执行SQL语句实现以下数据分析:
  1. 总共有多少学生?
  2. 总共开设了多少门课程?
  3. 每个学生选修了多少门课?
  4. 每门课程有多少个学生选?
  5. 每门课程>95分的学生人数
  6. 课程'Python'有多少个100分?
  7. Tom选修了几门课?每门课多少分?
  8. Tom的成绩按分数大小排序。
  9. Tom选修了哪几门课?
  10. Tom的平均分。
  11. 'OperatingSystem'不及格人数
  12. 'OperatingSystem'平均分
  13. 'OperatingSystem'90分以上人数
  14. 'OperatingSystem'前3名
  15. 每个分数按比例+20平时分。
  16. 求每门课的平均分
  17. 选修了7门课的有多少个学生?
  18. 每门课大于95分的学生数
  19. 每门课的选修人数、平均分、不及格人数、通过率
  20. 优秀、良好、通过和不合格各有多少人?
  21. 同时选修了DataStructure和 DataBase 的学生
  22. 选修了DataStructure 但没有选修 DataBase 的学生
  23. 选修课程数少于3门的同学
  24. 选修6门及以上课程数的同学
  25. 查询平均成绩大于等于60分的姓名和平均成绩
  26. 找出平均分最高的10位同学
posted @ 2022-05-20 00:43  叶文茂  阅读(93)  评论(0编辑  收藏  举报
TOP