8. SparkSQL综合作业

from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster("local").setAppName("My App")
sc=SparkContext(conf=conf)

url = "file:///home/hadoop/input/sc.txt"

rdd = sc.textFile(url).map(lambda line:line.split(',')).map(lambda line:[line[0],line[1],int(line[2])])
name = rdd.map(lambda a:(a[0],(a[1],a[2])))
from pyspark import SparkConf, SparkContext
from pyspark.sql.session import SparkSession
spark = SparkSession.builder.config(conf=conf).getOrCreate()

df = spark.createDataFrame(rdd,['name','course','score'])
df.createOrReplaceTempView('sql')
# 总共有多少学生?
print('rdd:',rdd.map(lambda a:a[0]).distinct().count())
print('df:',df.select('name').distinct().count())
spark.sql('SELECT COUNT(DISTINCT(name)) FROM sql').show()
rdd: 265
df: 265
+--------------------+
|count(DISTINCT name)|
+--------------------+
|                 265|
+--------------------+
# 总共开设了多少门课程?
print('rdd:',rdd.map(lambda a:a[1]).distinct().count())
print('df:',df.select('course').distinct().count())
spark.sql('SELECT COUNT(DISTINCT(course)) FROM sql').show()
rdd: 8
df: 8
+----------------------+
|count(DISTINCT course)|
+----------------------+
|                     8|
+----------------------+
# 每个学生选修了多少门课?
print('rdd:',rdd.countByKey())
df.groupBy('name').count().show()
spark.sql('SELECT name,COUNT(course) FROM sql GROUP BY name').show()
rdd: defaultdict(<class 'int'>, {'Aaron': 4, 'Abbott': 3, 'Abel': 4, 'Abraham': 3, 'Adair': 3, 'Adam': 3, 'Adolph': 4, 'Adonis': 5, 'Alan': 5, 'Albert': 3, 'Aldrich': 3, 'Alexander': 4, 'Alfred': 2, 'Alger': 5, 'Allen': 4, 'Alston': 4, 'Alva': 5, 'Alvin': 5, 'Alvis': 6, 'Amos': 5, 'Andrew': 4, 'Andy': 3, 'Angelo': 2, 'Antony': 5, 'Antonio': 3, 'Archer': 5, 'Archibald': 5, 'Aries': 2, 'Arlen': 4, 'Armand': 3, 'Armstrong': 2, 'Baron': 6, 'Barry': 5, 'Bartholomew': 5, 'Bart': 5, 'Barton': 1, 'Basil': 4, 'Beck': 4, 'Ben': 4, 'Benedict': 6, 'Benjamin': 4, 'Bennett': 6, 'Benson': 4, 'Berg': 4, 'Bernard': 2, 'Bernie': 3, 'Bert': 3, 'Bertram': 3, 'Bevis': 4, 'Bill': 2, 'Bing': 6, 'Bishop': 2, 'Blair': 4, 'Blake': 4, 'Blithe': 3, 'Bob': 3, 'Booth': 6, 'Borg': 4, 'Boris': 6, 'Bowen': 5, 'Boyce': 2, 'Boyd': 3, 'Bradley': 2, 'Brady': 5, 'Brandon': 5, 'Brian': 6, 'Broderick': 3, 'Brook': 4, 'Bruce': 3, 'Bruno': 5, 'Chad': 6, 'Channing': 4, 'Chapman': 4, 'Charles': 3, 'Chester': 6, 'Christ': 2, 'Christian': 2, 'Christopher': 4, 'Clare': 4, 'Clarence': 7, 'Clark': 6, 'Claude': 2, 'Clement': 5, 'Cleveland': 4, 'Cliff': 5, 'Clyde': 7, 'Colbert': 4, 'Colby': 4, 'Colin': 5, 'Conrad': 2, 'Corey': 4, 'Dean': 7, 'Dempsey': 4, 'Dennis': 4, 'Derrick': 6, 'Devin': 4, 'Dick': 3, 'Dominic': 4, 'Don': 2, 'Donahue': 5, 'Donald': 4, 'Drew': 5, 'Duke': 4, 'Duncann': 5, 'Edward': 4, 'Egbert': 4, 'Eli': 5, 'Elijah': 4, 'Elliot': 3, 'Ellis': 4, 'Elmer': 4, 'Elroy': 5, 'Elton': 5, 'Elvis': 2, 'Emmanuel': 3, 'Enoch': 3, 'Eric': 4, 'Ernest': 5, 'Eugene': 1, 'Evan': 3, 'Ford': 3, 'Francis': 4, 'Frank': 3, 'Geoffrey': 4, 'George': 4, 'Gerald': 4, 'Gilbert': 3, 'Giles': 7, 'Glenn': 6, 'Gordon': 4, 'Greg': 4, 'Griffith': 4, 'Harlan': 6, 'Harold': 4, 'Harry': 4, 'Harvey': 7, 'Hayden': 3, 'Henry': 2, 'Herbert': 3, 'Herman': 3, 'Hilary': 4, 'Hiram': 6, 'Hobart': 4, 'Hogan': 4, 'Horace': 5, 'Ivan': 4, 'Jason': 4, 'Jay': 6, 'Jeff': 4, 'Jeffrey': 4, 'Jeremy': 6, 'Jerome': 3, 'Jerry': 3, 'Jesse': 7, 'Jim': 4, 'Jo': 5, 'John': 6, 'Jonas': 4, 'Jonathan': 4, 'Joseph': 3, 'Joshua': 4, 'Ken': 3, 'Kennedy': 4, 'Kenneth': 3, 'Kent': 4, 'Kerr': 4, 'Kerwin': 3, 'Kevin': 4, 'Kim': 4, 'Len': 5, 'Lennon': 4, 'Leo': 5, 'Leonard': 2, 'Leopold': 7, 'Les': 6, 'Lester': 4, 'Levi': 2, 'Lewis': 4, 'Lionel': 4, 'Lou': 2, 'Louis': 6, 'Lucien': 5, 'Luthers': 5, 'Marico': 6, 'Mark': 7, 'Marlon': 4, 'Marsh': 4, 'Marshall': 4, 'Martin': 3, 'Marvin': 3, 'Matt': 4, 'Matthew': 2, 'Maurice': 2, 'Max': 3, 'Maxwell': 4, 'Meredith': 4, 'Merle': 3, 'Merlin': 5, 'Michael': 5, 'Mick': 4, 'Mike': 3, 'Miles': 6, 'Milo': 2, 'Monroe': 3, 'Montague': 3, 'Nelson': 5, 'Newman': 2, 'Nicholas': 5, 'Nick': 5, 'Nigel': 3, 'Noah': 4, 'Payne': 6, 'Perry': 5, 'Pete': 3, 'Peter': 4, 'Phil': 3, 'Philip': 2, 'Randolph': 3, 'Raymondt': 6, 'Robin': 4, 'Rock': 6, 'Rod': 4, 'Roderick': 4, 'Rodney': 3, 'Ron': 6, 'Ronald': 3, 'Rory': 4, 'Roy': 6, 'Samuel': 4, 'Sandy': 1, 'Saxon': 7, 'Scott': 3, 'Sean': 6, 'Sebastian': 6, 'Sid': 3, 'Sidney': 5, 'Simon': 2, 'Solomon': 5, 'Spencer': 5, 'Stan': 3, 'Todd': 3, 'Tom': 5, 'Tony': 3, 'Tracy': 3, 'Truman': 3, 'Upton': 5, 'Uriah': 1, 'Valentine': 8, 'Verne': 3, 'Vic': 3, 'Victor': 2, 'Vincent': 5, 'Virgil': 5, 'Walter': 4, 'Ward': 4, 'Webb': 7, 'Webster': 2, 'Will': 3, 'William': 6, 'Willie': 4, 'Winfred': 3, 'Winston': 4, 'Woodrow': 3, 'Wordsworth': 4, 'Wright': 4})
+-----------+-----+
|       name|count|
+-----------+-----+
|       Chad|    6|
|        Rod|    4|
|      Baron|    6|
|        Len|    5|
|      Scott|    3|
|    Donahue|    5|
|     Gerald|    4|
|     Monroe|    3|
|       Boyd|    3|
|      Alvin|    5|
|     Elliot|    3|
|     Blithe|    3|
|     Willie|    4|
|      Brady|    5|
|Bartholomew|    5|
|     Sidney|    5|
|   Benjamin|    4|
|      Bruno|    5|
|      Clare|    4|
|    Herbert|    3|
+-----------+-----+
only showing top 20 rows

+-----------+-------------+
|       name|count(course)|
+-----------+-------------+
|       Chad|            6|
|        Rod|            4|
|      Baron|            6|
|        Len|            5|
|      Scott|            3|
|    Donahue|            5|
|     Gerald|            4|
|     Monroe|            3|
|       Boyd|            3|
|      Alvin|            5|
|     Elliot|            3|
|     Blithe|            3|
|     Willie|            4|
|      Brady|            5|
|Bartholomew|            5|
|     Sidney|            5|
|   Benjamin|            4|
|      Bruno|            5|
|      Clare|            4|
|    Herbert|            3|
+-----------+-------------+
only showing top 20 rows
# 每门课程有多少个学生选?
print('rdd:',name.values().countByKey())
df.groupBy('course').count().show()
spark.sql('SELECT course,COUNT(course) FROM sql GROUP BY course').show()
rdd: defaultdict(<class 'int'>, {'OperatingSystem': 134, 'Python': 136, 'ComputerNetwork': 142, 'Software': 132, 'DataBase': 126, 'Algorithm': 144, 'DataStructure': 131, 'CLanguage': 128})
+---------------+-----+
|         course|count|
+---------------+-----+
|ComputerNetwork|  142|
|      CLanguage|  128|
|  DataStructure|  131|
|       DataBase|  126|
|      Algorithm|  144|
|OperatingSystem|  134|
|       Software|  132|
|         Python|  136|
+---------------+-----+

+---------------+-------------+
|         course|count(course)|
+---------------+-------------+
|ComputerNetwork|          142|
|      CLanguage|          128|
|  DataStructure|          131|
|       DataBase|          126|
|      Algorithm|          144|
|OperatingSystem|          134|
|       Software|          132|
|         Python|          136|
+---------------+-------------+
# 每门课程>95分的学生人数
print(rdd.filter(lambda a:a[2]>95).map(lambda a:(a[1],a[2])).countByKey())
df.filter(df['score']>95).groupBy('course').count().show()
spark.sql('SELECT course,count(course) FROM sql WHERE score > 95 GROUP BY course').show()
defaultdict(<class 'int'>, {'OperatingSystem': 11, 'Software': 5, 'Python': 18, 'CLanguage': 14, 'Algorithm': 7, 'DataBase': 4, 'ComputerNetwork': 10, 'DataStructure': 5})
+---------------+-----+
|         course|count|
+---------------+-----+
|ComputerNetwork|   10|
|      CLanguage|   14|
|  DataStructure|    5|
|       DataBase|    4|
|      Algorithm|    7|
|OperatingSystem|   11|
|       Software|    5|
|         Python|   18|
+---------------+-----+

+---------------+-------------+
|         course|count(course)|
+---------------+-------------+
|ComputerNetwork|           10|
|      CLanguage|           14|
|  DataStructure|            5|
|       DataBase|            4|
|      Algorithm|            7|
|OperatingSystem|           11|
|       Software|            5|
|         Python|           18|
+---------------+-------------+
# 课程'Python'有多少个100分?
print('rdd:',rdd.filter(lambda a:a[1] == 'Python' and a[2] == 100).count())
print(df.filter(df['course'] == 'Python').filter(df['score'] == 100).count())
spark.sql('SELECT COUNT("Python") FROM sql WHERE course = "Python" and score = 100').show()
rdd: 5
5
+-------------+
|count(Python)|
+-------------+
|            5|
+-------------+
# Tom选修了几门课?每门课多少分?
print('rdd',rdd.filter(lambda a:a[0] == 'Tom').count(),rdd.filter(lambda a:a[0] == 'Tom').collect())

print('df',df.filter(df['name'] == 'Tom').count())
df.filter(df['name'] == 'Tom').show()

spark.sql('SELECT COUNT(course) FROM sql WHERE name = "Tom"').show()
spark.sql('SELECT course,score FROM sql WHERE name = "Tom"').show()
rdd 5 [['Tom', 'DataBase', 26], ['Tom', 'Algorithm', 12], ['Tom', 'OperatingSystem', 16], ['Tom', 'Python', 40], ['Tom', 'Software', 60]]
df 5
+----+---------------+-----+
|name|         course|score|
+----+---------------+-----+
| Tom|       DataBase|   26|
| Tom|      Algorithm|   12|
| Tom|OperatingSystem|   16|
| Tom|         Python|   40|
| Tom|       Software|   60|
+----+---------------+-----+

+-------------+
|count(course)|
+-------------+
|            5|
+-------------+

+---------------+-----+
|         course|score|
+---------------+-----+
|       DataBase|   26|
|      Algorithm|   12|
|OperatingSystem|   16|
|         Python|   40|
|       Software|   60|
+---------------+-----+
# Tom的成绩按分数大小排序
print(rdd.filter(lambda a:a[0] == 'Tom').sortBy(lambda a:a[2],False).collect())

df.filter(df['name'] == 'Tom').sort(-df['score']).show()

spark.sql('SELECT * FROM sql WHERE name = "Tom" ORDER BY score DESC').show()
[['Tom', 'Software', 60], ['Tom', 'Python', 40], ['Tom', 'DataBase', 26], ['Tom', 'OperatingSystem', 16], ['Tom', 'Algorithm', 12]]
+----+---------------+-----+
|name|         course|score|
+----+---------------+-----+
| Tom|       Software|   60|
| Tom|         Python|   40|
| Tom|       DataBase|   26|
| Tom|OperatingSystem|   16|
| Tom|      Algorithm|   12|
+----+---------------+-----+

+----+---------------+-----+
|name|         course|score|
+----+---------------+-----+
| Tom|       Software|   60|
| Tom|         Python|   40|
| Tom|       DataBase|   26|
| Tom|OperatingSystem|   16|
| Tom|      Algorithm|   12|
+----+---------------+-----+
# Tom选修了哪几门课
print(rdd.filter(lambda a:a[0] == 'Tom').collect())

df.filter(df['name'] == 'Tom').show()

spark.sql('SELECT * FROM sql WHERE name = "Tom"').show()
[['Tom', 'DataBase', 26], ['Tom', 'Algorithm', 12], ['Tom', 'OperatingSystem', 16], ['Tom', 'Python', 40], ['Tom', 'Software', 60]]
+----+---------------+-----+
|name|         course|score|
+----+---------------+-----+
| Tom|       DataBase|   26|
| Tom|      Algorithm|   12|
| Tom|OperatingSystem|   16|
| Tom|         Python|   40|
| Tom|       Software|   60|
+----+---------------+-----+

+----+---------------+-----+
|name|         course|score|
+----+---------------+-----+
| Tom|       DataBase|   26|
| Tom|      Algorithm|   12|
| Tom|OperatingSystem|   16|
| Tom|         Python|   40|
| Tom|       Software|   60|
+----+---------------+-----+
# Tom的平均分
import numpy as np
print(np.mean(name.filter(lambda a:a[0] == 'Tom').values().values().collect()))

df.filter(df['name'] == 'Tom').agg({'score':'avg'}).show()

spark.sql('SELECT AVG(score) FROM sql WHERE name = "Tom"').show()
30.8
+----------+
|avg(score)|
+----------+
|      30.8|
+----------+

+----------+
|avg(score)|
+----------+
|      30.8|
+----------+
# 'OperatingSystem'不及格人数
print('rdd',rdd.filter(lambda a:a[1] == 'OperatingSystem' and a[2] < 60).count())

print('df',df.filter(df['course'] == 'OperatingSystem').filter(df['score'] < 60).count())

spark.sql('SELECT COUNT(score) FROM sql WHERE course = "OperatingSystem" AND score < 60').show()
rdd 63
df 63
+------------+
|count(score)|
+------------+
|          63|
+------------+
# 'OperatingSystem'平均分
print('rdd',np.mean(rdd.filter(lambda a:a[1] == 'OperatingSystem').map(lambda a:(a[1],a[2])).values().collect()))

df.filter(df['course'] == 'OperatingSystem').agg({'score':'avg'}).show()

spark.sql('SELECT AVG(score) FROM sql WHERE course = "OperatingSystem"').show()
rdd 54.940298507462686
+------------------+
|        avg(score)|
+------------------+
|54.940298507462686|
+------------------+

+------------------+
|        avg(score)|
+------------------+
|54.940298507462686|
+------------------+
# 'OperatingSystem'90分以上人数
print('rdd',rdd.filter(lambda a:a[1] == 'OperatingSystem' and a[2] > 90).count())

print('df',df.filter(df['course'] == 'OperatingSystem').filter(df['score'] > 90).count())

spark.sql('SELECT COUNT("OperatingSystem > 90") FROM sql WHERE course = "OperatingSystem" and score >90').show()
rdd 15
df 15
+---------------------------+
|count(OperatingSystem > 90)|
+---------------------------+
|                         15|
+---------------------------+
# 'OperatingSystem'前3名
print(rdd.filter(lambda a:a[1] == 'OperatingSystem').sortBy(lambda a:a[2],False).take(3))

print(df.filter(df['course'] == 'OperatingSystem').sort(-df['score']).head(3))

spark.sql('SELECT * FROM sql WHERE course = "OperatingSystem" ORDER BY score desc limit 3').show()
[['Aaron', 'OperatingSystem', 100], ['Bing', 'OperatingSystem', 100], ['Evan', 'OperatingSystem', 100]]
[Row(name='Harold', course='OperatingSystem', score=100), Row(name='Evan', course='OperatingSystem', score=100), Row(name='Aaron', course='OperatingSystem', score=100)]
+------+---------------+-----+
|  name|         course|score|
+------+---------------+-----+
|Harold|OperatingSystem|  100|
|  Evan|OperatingSystem|  100|
| Aaron|OperatingSystem|  100|
+------+---------------+-----+
# 每个分数按比例+20平时分
print(rdd.map(lambda a:(a[0],a[1],a[2]+20)).take(3))

print(df.select(df['score']+20).head(3))

print(spark.sql('SELECT (score + 20) AS score FROM sql').head(3))
[('Aaron', 'OperatingSystem', 120), ('Aaron', 'Python', 70), ('Aaron', 'ComputerNetwork', 50)]
[Row((score + 20)=120), Row((score + 20)=70), Row((score + 20)=50)]
[Row(score=120), Row(score=70), Row(score=50)]
# 求每门课的平均分
course = rdd.map(lambda a:(a[1],a[2]))
print(course.combineByKey(lambda a:(a,1),lambda a,b:(a[0]+b,a[1]+1),lambda a,b:a+b).map(lambda a:(a[0],a[1][0]/a[1][1])).collect())

df.groupBy('course').avg('score').show()

spark.sql('SELECT course,AVG(score) FROM sql GROUP BY course').show()
[('OperatingSystem', 54.940298507462686), ('Python', 57.8235294117647), ('ComputerNetwork', 51.901408450704224), ('Software', 50.90909090909091), ('DataBase', 50.53968253968254), ('Algorithm', 48.833333333333336), ('DataStructure', 47.57251908396947), ('CLanguage', 50.609375)]
+---------------+------------------+
|         course|        avg(score)|
+---------------+------------------+
|ComputerNetwork|51.901408450704224|
|      CLanguage|         50.609375|
|  DataStructure| 47.57251908396947|
|       DataBase| 50.53968253968254|
|      Algorithm|48.833333333333336|
|OperatingSystem|54.940298507462686|
|       Software| 50.90909090909091|
|         Python|  57.8235294117647|
+---------------+------------------+

+---------------+------------------+
|         course|        avg(score)|
+---------------+------------------+
|ComputerNetwork|51.901408450704224|
|      CLanguage|         50.609375|
|  DataStructure| 47.57251908396947|
|       DataBase| 50.53968253968254|
|      Algorithm|48.833333333333336|
|OperatingSystem|54.940298507462686|
|       Software| 50.90909090909091|
|         Python|  57.8235294117647|
+---------------+------------------+
# 选修了7门课的有多少个学生?
print(rdd.map(lambda a:(a[0],1)).reduceByKey(lambda a,b:a+b).filter(lambda a:a[1] == 7).count())

print(df.groupby('name').count().where("count(name) == 7").count())

print(spark.sql('SELECT name,COUNT(name) AS count FROM sql GROUP BY name HAVING count = 7').count())
10
10
10
# 每门课大于95分的学生数
print(rdd.filter(lambda a:a[2]>95).map(lambda a:(a[1],a[2])).countByKey())
df.filter(df['score']>95).groupBy('course').count().show()
spark.sql('SELECT course,count(course) FROM sql WHERE score > 95 GROUP BY course').show()
defaultdict(<class 'int'>, {'OperatingSystem': 11, 'Software': 5, 'Python': 18, 'CLanguage': 14, 'Algorithm': 7, 'DataBase': 4, 'ComputerNetwork': 10, 'DataStructure': 5})
+---------------+-----+
|         course|count|
+---------------+-----+
|ComputerNetwork|   10|
|      CLanguage|   14|
|  DataStructure|    5|
|       DataBase|    4|
|      Algorithm|    7|
|OperatingSystem|   11|
|       Software|    5|
|         Python|   18|
+---------------+-----+

+---------------+-------------+
|         course|count(course)|
+---------------+-------------+
|ComputerNetwork|           10|
|      CLanguage|           14|
|  DataStructure|            5|
|       DataBase|            4|
|      Algorithm|            7|
|OperatingSystem|           11|
|       Software|            5|
|         Python|           18|
+---------------+-------------+
# 每门课的选修人数、平均分、不及格人数、通过率
course = rdd.map(lambda a:(a[1],a[2]))

print(course.countByKey())
print(course.combineByKey(lambda a:(a,1),lambda a,b:(a[0]+b,a[1]+1),lambda a,b:a+b).map(lambda a:(a[0],a[1][0]/a[1][1])).collect())
print(course.filter(lambda a:a[1] < 60).countByKey())
rdd.map(lambda a:(a[1], 1)).reduceByKey(lambda a,b:a+b).join(
  rdd.map(lambda a:(a[1], a[2])).filter(lambda a:int(a[1]) < 60).map(lambda a:(a[0], 1)).reduceByKey(lambda a,b:a+b)
).map(lambda a:(a[0], round(1-(a[1][1]/a[1][0]),2))).collect()
defaultdict(<class 'int'>, {'OperatingSystem': 134, 'Python': 136, 'ComputerNetwork': 142, 'Software': 132, 'DataBase': 126, 'Algorithm': 144, 'DataStructure': 131, 'CLanguage': 128})
[('OperatingSystem', 54.940298507462686), ('Python', 57.8235294117647), ('ComputerNetwork', 51.901408450704224), ('Software', 50.90909090909091), ('DataBase', 50.53968253968254), ('Algorithm', 48.833333333333336), ('DataStructure', 47.57251908396947), ('CLanguage', 50.609375)]
defaultdict(<class 'int'>, {'Python': 67, 'ComputerNetwork': 80, 'DataBase': 71, 'Algorithm': 90, 'DataStructure': 84, 'OperatingSystem': 63, 'Software': 76, 'CLanguage': 78})





[('OperatingSystem', 0.53),
 ('Python', 0.51),
 ('ComputerNetwork', 0.44),
 ('Software', 0.42),
 ('DataBase', 0.44),
 ('Algorithm', 0.38),
 ('DataStructure', 0.36),
 ('CLanguage', 0.39)]
df1 = df.groupBy('course').count().withColumnRenamed('count','totalCount').join(
    df.groupBy('course').avg().withColumnRenamed('avg(score)','avg'),"course"
).join(
    df.filter(df['score'] < 60).groupBy('course').count().withColumnRenamed('count','notPass'),'course'
)
df1.select(
    'course','totalCount','avg','notPass',1-df1['notPass']/df1['totalCount']
).withColumnRenamed('(1 - (notPass / totalCount))','passRat').show()
+---------------+----------+------------------+-------+-------------------+
|         course|totalCount|               avg|notPass|            passRat|
+---------------+----------+------------------+-------+-------------------+
|ComputerNetwork|       142|51.901408450704224|     80|0.43661971830985913|
|      CLanguage|       128|         50.609375|     78|           0.390625|
|  DataStructure|       131| 47.57251908396947|     84| 0.3587786259541985|
|       DataBase|       126| 50.53968253968254|     71| 0.4365079365079365|
|      Algorithm|       144|48.833333333333336|     90|              0.375|
|OperatingSystem|       134|54.940298507462686|     63| 0.5298507462686567|
|       Software|       132| 50.90909090909091|     76| 0.4242424242424242|
|         Python|       136|  57.8235294117647|     67| 0.5073529411764706|
+---------------+----------+------------------+-------+-------------------+
spark.sql('SELECT course, COUNT(name) AS totalCount,avg(score) AS avg FROM sql GROUP BY course').createOrReplaceTempView("a")
spark.sql('SELECT course,COUNT(score) AS notPass FROM sql WHERE score<60 GROUP BY course').createOrReplaceTempView("b")
spark.sql('SELECT a.course,a.totalCount,round(a.avg,2)AS avg,b.notPass ,round((a.totalCount-b.notPass)/a.totalCount,2) AS passRat FROM a left JOIN b ON a.course=b.course').show()
+---------------+----------+-----+-------+-------+
|         course|totalCount|  avg|notPass|passRat|
+---------------+----------+-----+-------+-------+
|ComputerNetwork|       142| 51.9|     80|   0.44|
|      CLanguage|       128|50.61|     78|   0.39|
|  DataStructure|       131|47.57|     84|   0.36|
|       DataBase|       126|50.54|     71|   0.44|
|      Algorithm|       144|48.83|     90|   0.38|
|OperatingSystem|       134|54.94|     63|   0.53|
|       Software|       132|50.91|     76|   0.42|
|         Python|       136|57.82|     67|   0.51|
+---------------+----------+-----+-------+-------+
# 优秀、良好、通过和不合格各有多少人?
print('优秀:',rdd.filter(lambda a:a[2] >= 90).count())
print('良好:',rdd.filter(lambda a:a[2] >= 75 and a[2] < 90).count())
print('通过:',rdd.filter(lambda a:a[2] >= 60 and a[2] < 75).count())
print('不及格:',rdd.filter(lambda a:a[2] < 60).count())
优秀: 138
良好: 143
通过: 183
不及格: 609
print('优秀:',df.filter(df['score'] >= 90).count())
print('良好:',df.filter((df['score'] >= 75) & (df['score'] < 90)).count())
print('通过:',df.filter((df['score'] >= 60) & (df['score'] < 75)).count())
print('不及格:',df.filter(df['score'] < 60).count())
优秀: 138
良好: 143
通过: 183
不及格: 609
spark.sql('SELECT count("优秀") FROM sql WHERE score >= 90').show()
spark.sql('SELECT count("良好") FROM sql WHERE score >= 75 AND score < 90').show()
spark.sql('SELECT count("通过") FROM sql WHERE score >= 60 AND score < 75').show()
spark.sql('SELECT count("不及格") FROM sql WHERE score < 60').show()
+-----------+
|count(优秀)|
+-----------+
|        138|
+-----------+

+-----------+
|count(良好)|
+-----------+
|        143|
+-----------+

+-----------+
|count(通过)|
+-----------+
|        183|
+-----------+

+-------------+
|count(不及格)|
+-------------+
|          609|
+-------------+
# 同时选修了DataStructure和 DataBase 的学生
DAD = rdd.filter(lambda a:a[1] == 'DataStructure' or a[1] == 'DataBase')
print(DAD.map(lambda a:(a[0],1)).reduceByKey(lambda a,b:a+b).filter(lambda a:a[1] > 1).map(lambda a:a[0]).collect())

print(df[(df['course'] == 'DataStructure') | (df['course'] == 'DataBase')].groupBy('name').count().where('count > 1').select('name').show())

spark.sql('SELECT name,COUNT(name) AS count FROM sql WHERE course = "DataStructure" OR course = "DataBase" \
          GROUP BY name HAVING count > 1').select('name').show()
['Adonis', 'Alva', 'Amos', 'Armand', 'Bart', 'Beck', 'Benedict', 'Bennett', 'Bing', 'Bowen', 'Brady', 'Brandon', 'Bruno', 'Chad', 'Chester', 'Christopher', 'Clarence', 'Clark', 'Clyde', 'Colby', 'Dean', 'Devin', 'Donahue', 'Edward', 'Elroy', 'Elton', 'Elvis', 'Giles', 'Harvey', 'Hiram', 'Hobart', 'Jeff', 'Jeremy', 'Jerome', 'Jo', 'Kennedy', 'Kent', 'Kevin', 'Kim', 'Leopold', 'Les', 'Louis', 'Marico', 'Mark', 'Matt', 'Miles', 'Nelson', 'Nicholas', 'Payne', 'Peter', 'Raymondt', 'Ron', 'Roy', 'Saxon', 'Sidney', 'Solomon', 'Upton', 'Valentine', 'Webb', 'William']
+-----------+
|       name|
+-----------+
|       Chad|
|    Donahue|
|      Brady|
|     Sidney|
|      Bruno|
|  Valentine|
|       Alva|
|       Webb|
|         Jo|
|        Roy|
|      Upton|
|       Amos|
|    Bennett|
|      Payne|
|        Les|
|     Marico|
|       Dean|
|   Raymondt|
|Christopher|
|      Elvis|
+-----------+
only showing top 20 rows

None
+-----------+
|       name|
+-----------+
|       Chad|
|    Donahue|
|      Brady|
|     Sidney|
|      Bruno|
|  Valentine|
|       Alva|
|       Webb|
|         Jo|
|        Roy|
|      Upton|
|       Amos|
|    Bennett|
|      Payne|
|        Les|
|     Marico|
|       Dean|
|   Raymondt|
|Christopher|
|      Elvis|
+-----------+
only showing top 20 rows
# 选修了DataStructure 但没有选修 DataBase 的学生
DOD = rdd.filter(lambda a:a[1] == 'DataStructure' or a[1] == 'DataBase')
print(DOD.map(lambda a:(a[0],a[1])).reduceByKey(lambda a,b:a+b).filter(lambda a:a[1] == 'DataStructure').map(lambda a:a[0]).collect())
['Abel', 'Abraham', 'Adolph', 'Albert', 'Alexander', 'Alston', 'Alvis', 'Andrew', 'Aries', 'Arlen', 'Baron', 'Barry', 'Benjamin', 'Benson', 'Berg', 'Bernard', 'Bernie', 'Blithe', 'Boris', 'Boyd', 'Brian', 'Broderick', 'Brook', 'Channing', 'Christ', 'Christian', 'Clare', 'Cliff', 'Dempsey', 'Dennis', 'Dick', 'Drew', 'Duncann', 'Elmer', 'Evan', 'George', 'Greg', 'Griffith', 'Harold', 'Hayden', 'Hilary', 'Jay', 'Jeffrey', 'Jerry', 'Jesse', 'Joseph', 'Joshua', 'Lester', 'Lewis', 'Lionel', 'Lucien', 'Luthers', 'Marlon', 'Maurice', 'Max', 'Merlin', 'Mick', 'Mike', 'Perry', 'Pete', 'Randolph', 'Robin', 'Sandy', 'Spencer', 'Stan', 'Virgil', 'Ward', 'Willie', 'Winston', 'Wordsworth']
df1 = df.filter(df['course'] == 'DataBase').select('name','course').withColumnRenamed('course','DataBase')
df2 = df.filter(df['course'] == 'DataStructure').select('name','course').withColumnRenamed('course','DataStructure')
df3 = df1.join(df2,'name','right')
df3.filter(df3['DataBase'].isNull()).select('name').show()
+---------+
|     name|
+---------+
|    Baron|
|     Boyd|
|   Blithe|
|   Willie|
| Benjamin|
|    Clare|
|   Alston|
|   Hayden|
|Christian|
|     Pete|
|   Adolph|
|  Spencer|
|    Alvis|
|    Barry|
|    Sandy|
|   Albert|
|  Bernard|
|   Merlin|
|   Lester|
|  Jeffrey|
+---------+
only showing top 20 rows
spark.sql('SELECT DISTINCT name,course AS DataStructure FROM sql WHERE course = "DataStructure"').createOrReplaceTempView("a")
spark.sql('SELECT DISTINCT name,course AS DataBase FROM sql WHERE course = "DataBase"').createOrReplaceTempView("b")
spark.sql('SELECT a.name FROM a LEFT JOIN b ON a.name = b.name WHERE DataBase is null').show()
+---------+
|     name|
+---------+
|    Baron|
|     Boyd|
|   Blithe|
|   Willie|
| Benjamin|
|    Clare|
|   Alston|
|   Hayden|
|Christian|
|     Pete|
|   Adolph|
|  Spencer|
|    Alvis|
|    Barry|
|    Sandy|
|   Albert|
|  Bernard|
|   Merlin|
|   Lester|
|  Jeffrey|
+---------+
only showing top 20 rows
# 选修课程数少于3门的同学
print(rdd.map(lambda a:(a[0],1)).reduceByKey(lambda a,b:a+b).filter(lambda a:a[1]<3).map(lambda a:a[0]).collect())

df.groupBy(['name']).count().where("count < 3").select('name').show()

spark.sql('SELECT name,count(course) AS count FROM sql GROUP BY name HAVING count < 3').select('name').show()
['Alfred', 'Angelo', 'Aries', 'Armstrong', 'Barton', 'Bernard', 'Bill', 'Bishop', 'Boyce', 'Bradley', 'Christ', 'Christian', 'Claude', 'Conrad', 'Don', 'Elvis', 'Eugene', 'Henry', 'Leonard', 'Levi', 'Lou', 'Matthew', 'Maurice', 'Milo', 'Newman', 'Philip', 'Sandy', 'Simon', 'Uriah', 'Victor', 'Webster']
+---------+
|     name|
+---------+
|  Matthew|
|Christian|
|    Sandy|
|   Alfred|
|  Bernard|
|  Webster|
|      Don|
|     Bill|
|    Elvis|
|    Aries|
|    Boyce|
|      Lou|
|    Simon|
|   Christ|
|  Maurice|
|  Bradley|
|Armstrong|
|     Milo|
|    Uriah|
|  Leonard|
+---------+
only showing top 20 rows

+---------+
|     name|
+---------+
|  Matthew|
|Christian|
|    Sandy|
|   Alfred|
|  Bernard|
|  Webster|
|      Don|
|     Bill|
|    Elvis|
|    Aries|
|    Boyce|
|      Lou|
|    Simon|
|   Christ|
|  Maurice|
|  Bradley|
|Armstrong|
|     Milo|
|    Uriah|
|  Leonard|
+---------+
only showing top 20 rows
# 选修6门及以上课程数的同学
print(rdd.map(lambda a:(a[0],1)).reduceByKey(lambda a,b:a+b).filter(lambda a:a[1]>=6).map(lambda a:a[0]).collect())

df.groupBy('name').count().where('count >= 6').select('name').show()

spark.sql('SELECT name,count(name) AS count FROM sql GROUP BY name HAVING count >= 6').select('name').show()
['Alvis', 'Baron', 'Benedict', 'Bennett', 'Bing', 'Booth', 'Boris', 'Brian', 'Chad', 'Chester', 'Clarence', 'Clark', 'Clyde', 'Dean', 'Derrick', 'Giles', 'Glenn', 'Harlan', 'Harvey', 'Hiram', 'Jay', 'Jeremy', 'Jesse', 'John', 'Leopold', 'Les', 'Louis', 'Marico', 'Mark', 'Miles', 'Payne', 'Raymondt', 'Rock', 'Ron', 'Roy', 'Saxon', 'Sean', 'Sebastian', 'Valentine', 'Webb', 'William']
+---------+
|     name|
+---------+
|     Chad|
|    Baron|
|Valentine|
|     Webb|
|      Roy|
|    Alvis|
|     Rock|
|  Bennett|
|    Payne|
|      Les|
|   Marico|
|   Harlan|
|     Dean|
| Raymondt|
|   Harvey|
|     Bing|
|  Derrick|
|    Saxon|
| Clarence|
|     Sean|
+---------+
only showing top 20 rows

+---------+
|     name|
+---------+
|     Chad|
|    Baron|
|Valentine|
|     Webb|
|      Roy|
|    Alvis|
|     Rock|
|  Bennett|
|    Payne|
|      Les|
|   Marico|
|   Harlan|
|     Dean|
| Raymondt|
|   Harvey|
|     Bing|
|  Derrick|
|    Saxon|
| Clarence|
|     Sean|
+---------+
only showing top 20 rows
# 查询平均成绩大于等于60分的姓名和平均成绩
course = rdd.map(lambda a:(a[0],a[2])).combineByKey(lambda a:(a,1),lambda a,b:(a[0]+b,a[1]+1),lambda a,b:a+b)
print(course.map(lambda a:(a[0],a[1][0]/a[1][1])).filter(lambda a:a[1] >= 60).collect())

df.groupBy('name').avg().where('avg(score) >= 60').show()

spark.sql('SELECT name,AVG(score) AS avg FROM sql GROUP BY name HAVING avg >= 60').show()
[('Aaron', 68.5), ('Abraham', 62.666666666666664), ('Adair', 68.66666666666667), ('Adolph', 67.5), ('Alan', 77.6), ('Albert', 66.0), ('Aldrich', 73.33333333333333), ('Alfred', 78.0), ('Alston', 69.0), ('Alvin', 74.0), ('Andrew', 68.0), ('Antony', 68.8), ('Baron', 60.0), ('Barton', 90.0), ('Basil', 61.5), ('Benedict', 70.33333333333333), ('Benjamin', 77.5), ('Benson', 60.0), ('Berg', 69.5), ('Bernard', 72.0), ('Bevis', 74.0), ('Bill', 71.0), ('Bing', 62.666666666666664), ('Blake', 63.0), ('Blithe', 81.33333333333333), ('Brady', 64.8), ('Brandon', 64.4), ('Brook', 63.0), ('Bruce', 62.666666666666664), ('Bruno', 68.4), ('Chapman', 60.5), ('Christ', 78.0), ('Clarence', 70.28571428571429), ('Claude', 61.0), ('Cleveland', 73.0), ('Clyde', 71.71428571428571), ('Colby', 62.5), ('Conrad', 62.0), ('Dean', 60.857142857142854), ('Dempsey', 67.0), ('Dennis', 64.0), ('Duncann', 63.2), ('Elliot', 78.0), ('Elroy', 61.6), ('Eugene', 80.0), ('Frank', 64.0), ('Gilbert', 82.66666666666667), ('Harold', 60.5), ('Jeffrey', 64.0), ('Jeremy', 73.0), ('Jesse', 64.0), ('Ken', 67.33333333333333), ('Len', 67.6), ('Leopold', 63.142857142857146), ('Lester', 86.5), ('Levi', 61.0), ('Lou', 70.0), ('Lucien', 65.2), ('Marsh', 65.0), ('Martin', 73.33333333333333), ('Michael', 60.8), ('Mick', 60.0), ('Mike', 70.0), ('Miles', 61.666666666666664), ('Milo', 66.0), ('Newman', 68.0), ('Nick', 64.8), ('Payne', 68.0), ('Perry', 62.0), ('Peter', 61.5), ('Randolph', 63.333333333333336), ('Raymondt', 73.66666666666667), ('Rock', 63.0), ('Rod', 63.0), ('Rory', 61.5), ('Sandy', 72.0), ('Simon', 80.0), ('Spencer', 64.0), ('Tracy', 60.0), ('Truman', 62.666666666666664), ('Upton', 67.6), ('Verne', 66.0), ('Vic', 61.333333333333336), ('Vincent', 65.6), ('Ward', 71.5), ('William', 68.0), ('Wright', 70.0)]
+--------+-----------------+
|    name|       avg(score)|
+--------+-----------------+
|     Rod|             63.0|
|   Baron|             60.0|
|     Len|             67.6|
|   Alvin|             74.0|
|  Elliot|             78.0|
|  Blithe|81.33333333333333|
|   Brady|             64.8|
|Benjamin|             77.5|
|   Bruno|             68.4|
|  Alston|             69.0|
|   Marsh|             65.0|
|  Adolph|             67.5|
| Spencer|             64.0|
|   Sandy|             72.0|
|   Adair|68.66666666666667|
|  Alfred|             78.0|
|   Upton|             67.6|
|   Tracy|             60.0|
|  Albert|             66.0|
|    Rock|             63.0|
+--------+-----------------+
only showing top 20 rows

+--------+-----------------+
|    name|              avg|
+--------+-----------------+
|     Rod|             63.0|
|   Baron|             60.0|
|     Len|             67.6|
|   Alvin|             74.0|
|  Elliot|             78.0|
|  Blithe|81.33333333333333|
|   Brady|             64.8|
|Benjamin|             77.5|
|   Bruno|             68.4|
|  Alston|             69.0|
|   Marsh|             65.0|
|  Adolph|             67.5|
| Spencer|             64.0|
|   Sandy|             72.0|
|   Adair|68.66666666666667|
|  Alfred|             78.0|
|   Upton|             67.6|
|   Tracy|             60.0|
|  Albert|             66.0|
|    Rock|             63.0|
+--------+-----------------+
only showing top 20 rows
# 找出平均分最高的10位同学
course = rdd.map(lambda a:(a[0],a[2])).combineByKey(lambda a:(a,1),lambda a,b:(a[0]+b,a[1]+1),lambda a,b:a+b)
print(course.map(lambda a:(a[0],a[1][0]/a[1][1])).sortBy(lambda a:a[1],False).take(10))

df2 = df.groupBy('name').avg()
df2.sort(-df2['avg(score)']).limit(10).show()

spark.sql('SELECT name,AVG(score) AS avg FROM sql GROUP BY name ORDER BY avg DESC LIMIT 10').show()
[('Barton', 90.0), ('Lester', 86.5), ('Gilbert', 82.66666666666667), ('Blithe', 81.33333333333333), ('Eugene', 80.0), ('Simon', 80.0), ('Alfred', 78.0), ('Christ', 78.0), ('Elliot', 78.0), ('Alan', 77.6)]
+-------+-----------------+
|   name|       avg(score)|
+-------+-----------------+
| Barton|             90.0|
| Lester|             86.5|
|Gilbert|82.66666666666667|
| Blithe|81.33333333333333|
|  Simon|             80.0|
| Eugene|             80.0|
| Elliot|             78.0|
| Alfred|             78.0|
| Christ|             78.0|
|   Alan|             77.6|
+-------+-----------------+

+-------+-----------------+
|   name|              avg|
+-------+-----------------+
| Barton|             90.0|
| Lester|             86.5|
|Gilbert|82.66666666666667|
| Blithe|81.33333333333333|
|  Simon|             80.0|
| Eugene|             80.0|
| Elliot|             78.0|
| Alfred|             78.0|
| Christ|             78.0|
|   Alan|             77.6|
+-------+-----------------+
posted @ 2022-05-27 21:21  coder-one  阅读(271)  评论(0)    收藏  举报