from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)

# Each record becomes [name, course, score] with the score cast to int.
input_path = "file:///home/hadoop/input/sc.txt"
rdd = (sc.textFile(input_path)
       .map(lambda line: line.split(','))
       .map(lambda parts: [parts[0], parts[1], int(parts[2])]))
# Pair RDD keyed by student: (name, (course, score)).
name = rdd.map(lambda rec: (rec[0], (rec[1], rec[2])))
# SparkConf/SparkContext are already imported at the top of the file;
# the redundant re-import was removed. Only SparkSession is new here.
from pyspark.sql.session import SparkSession

# Reuse the existing SparkConf so the session shares the same configuration.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# DataFrame view of the same records, registered for Spark SQL as table 'sql'.
df = spark.createDataFrame(rdd, ['name', 'course', 'score'])
df.createOrReplaceTempView('sql')
# How many distinct students are there in total?
print('rdd:', rdd.map(lambda rec: rec[0]).distinct().count())
print('df:', df.select('name').distinct().count())
spark.sql('SELECT COUNT(DISTINCT(name)) FROM sql').show()
rdd: 265
df: 265
+--------------------+
|count(DISTINCT name)|
+--------------------+
| 265|
+--------------------+
# How many distinct courses are offered in total?
print('rdd:', rdd.map(lambda rec: rec[1]).distinct().count())
print('df:', df.select('course').distinct().count())
spark.sql('SELECT COUNT(DISTINCT(course)) FROM sql').show()
rdd: 8
df: 8
+----------------------+
|count(DISTINCT course)|
+----------------------+
| 8|
+----------------------+
# How many courses did each student take?
# countByKey uses element[0] (the student name) as the key.
print('rdd:', rdd.countByKey())
df.groupBy('name').count().show()
spark.sql('SELECT name,COUNT(course) FROM sql GROUP BY name').show()
rdd: defaultdict(<class 'int'>, {'Aaron': 4, 'Abbott': 3, 'Abel': 4, 'Abraham': 3, 'Adair': 3, 'Adam': 3, 'Adolph': 4, 'Adonis': 5, 'Alan': 5, 'Albert': 3, 'Aldrich': 3, 'Alexander': 4, 'Alfred': 2, 'Alger': 5, 'Allen': 4, 'Alston': 4, 'Alva': 5, 'Alvin': 5, 'Alvis': 6, 'Amos': 5, 'Andrew': 4, 'Andy': 3, 'Angelo': 2, 'Antony': 5, 'Antonio': 3, 'Archer': 5, 'Archibald': 5, 'Aries': 2, 'Arlen': 4, 'Armand': 3, 'Armstrong': 2, 'Baron': 6, 'Barry': 5, 'Bartholomew': 5, 'Bart': 5, 'Barton': 1, 'Basil': 4, 'Beck': 4, 'Ben': 4, 'Benedict': 6, 'Benjamin': 4, 'Bennett': 6, 'Benson': 4, 'Berg': 4, 'Bernard': 2, 'Bernie': 3, 'Bert': 3, 'Bertram': 3, 'Bevis': 4, 'Bill': 2, 'Bing': 6, 'Bishop': 2, 'Blair': 4, 'Blake': 4, 'Blithe': 3, 'Bob': 3, 'Booth': 6, 'Borg': 4, 'Boris': 6, 'Bowen': 5, 'Boyce': 2, 'Boyd': 3, 'Bradley': 2, 'Brady': 5, 'Brandon': 5, 'Brian': 6, 'Broderick': 3, 'Brook': 4, 'Bruce': 3, 'Bruno': 5, 'Chad': 6, 'Channing': 4, 'Chapman': 4, 'Charles': 3, 'Chester': 6, 'Christ': 2, 'Christian': 2, 'Christopher': 4, 'Clare': 4, 'Clarence': 7, 'Clark': 6, 'Claude': 2, 'Clement': 5, 'Cleveland': 4, 'Cliff': 5, 'Clyde': 7, 'Colbert': 4, 'Colby': 4, 'Colin': 5, 'Conrad': 2, 'Corey': 4, 'Dean': 7, 'Dempsey': 4, 'Dennis': 4, 'Derrick': 6, 'Devin': 4, 'Dick': 3, 'Dominic': 4, 'Don': 2, 'Donahue': 5, 'Donald': 4, 'Drew': 5, 'Duke': 4, 'Duncann': 5, 'Edward': 4, 'Egbert': 4, 'Eli': 5, 'Elijah': 4, 'Elliot': 3, 'Ellis': 4, 'Elmer': 4, 'Elroy': 5, 'Elton': 5, 'Elvis': 2, 'Emmanuel': 3, 'Enoch': 3, 'Eric': 4, 'Ernest': 5, 'Eugene': 1, 'Evan': 3, 'Ford': 3, 'Francis': 4, 'Frank': 3, 'Geoffrey': 4, 'George': 4, 'Gerald': 4, 'Gilbert': 3, 'Giles': 7, 'Glenn': 6, 'Gordon': 4, 'Greg': 4, 'Griffith': 4, 'Harlan': 6, 'Harold': 4, 'Harry': 4, 'Harvey': 7, 'Hayden': 3, 'Henry': 2, 'Herbert': 3, 'Herman': 3, 'Hilary': 4, 'Hiram': 6, 'Hobart': 4, 'Hogan': 4, 'Horace': 5, 'Ivan': 4, 'Jason': 4, 'Jay': 6, 'Jeff': 4, 'Jeffrey': 4, 'Jeremy': 6, 'Jerome': 3, 'Jerry': 3, 'Jesse': 7, 'Jim': 4, 
'Jo': 5, 'John': 6, 'Jonas': 4, 'Jonathan': 4, 'Joseph': 3, 'Joshua': 4, 'Ken': 3, 'Kennedy': 4, 'Kenneth': 3, 'Kent': 4, 'Kerr': 4, 'Kerwin': 3, 'Kevin': 4, 'Kim': 4, 'Len': 5, 'Lennon': 4, 'Leo': 5, 'Leonard': 2, 'Leopold': 7, 'Les': 6, 'Lester': 4, 'Levi': 2, 'Lewis': 4, 'Lionel': 4, 'Lou': 2, 'Louis': 6, 'Lucien': 5, 'Luthers': 5, 'Marico': 6, 'Mark': 7, 'Marlon': 4, 'Marsh': 4, 'Marshall': 4, 'Martin': 3, 'Marvin': 3, 'Matt': 4, 'Matthew': 2, 'Maurice': 2, 'Max': 3, 'Maxwell': 4, 'Meredith': 4, 'Merle': 3, 'Merlin': 5, 'Michael': 5, 'Mick': 4, 'Mike': 3, 'Miles': 6, 'Milo': 2, 'Monroe': 3, 'Montague': 3, 'Nelson': 5, 'Newman': 2, 'Nicholas': 5, 'Nick': 5, 'Nigel': 3, 'Noah': 4, 'Payne': 6, 'Perry': 5, 'Pete': 3, 'Peter': 4, 'Phil': 3, 'Philip': 2, 'Randolph': 3, 'Raymondt': 6, 'Robin': 4, 'Rock': 6, 'Rod': 4, 'Roderick': 4, 'Rodney': 3, 'Ron': 6, 'Ronald': 3, 'Rory': 4, 'Roy': 6, 'Samuel': 4, 'Sandy': 1, 'Saxon': 7, 'Scott': 3, 'Sean': 6, 'Sebastian': 6, 'Sid': 3, 'Sidney': 5, 'Simon': 2, 'Solomon': 5, 'Spencer': 5, 'Stan': 3, 'Todd': 3, 'Tom': 5, 'Tony': 3, 'Tracy': 3, 'Truman': 3, 'Upton': 5, 'Uriah': 1, 'Valentine': 8, 'Verne': 3, 'Vic': 3, 'Victor': 2, 'Vincent': 5, 'Virgil': 5, 'Walter': 4, 'Ward': 4, 'Webb': 7, 'Webster': 2, 'Will': 3, 'William': 6, 'Willie': 4, 'Winfred': 3, 'Winston': 4, 'Woodrow': 3, 'Wordsworth': 4, 'Wright': 4})
+-----------+-----+
| name|count|
+-----------+-----+
| Chad| 6|
| Rod| 4|
| Baron| 6|
| Len| 5|
| Scott| 3|
| Donahue| 5|
| Gerald| 4|
| Monroe| 3|
| Boyd| 3|
| Alvin| 5|
| Elliot| 3|
| Blithe| 3|
| Willie| 4|
| Brady| 5|
|Bartholomew| 5|
| Sidney| 5|
| Benjamin| 4|
| Bruno| 5|
| Clare| 4|
| Herbert| 3|
+-----------+-----+
only showing top 20 rows
+-----------+-------------+
| name|count(course)|
+-----------+-------------+
| Chad| 6|
| Rod| 4|
| Baron| 6|
| Len| 5|
| Scott| 3|
| Donahue| 5|
| Gerald| 4|
| Monroe| 3|
| Boyd| 3|
| Alvin| 5|
| Elliot| 3|
| Blithe| 3|
| Willie| 4|
| Brady| 5|
|Bartholomew| 5|
| Sidney| 5|
| Benjamin| 4|
| Bruno| 5|
| Clare| 4|
| Herbert| 3|
+-----------+-------------+
only showing top 20 rows
# How many students enrolled in each course?
# name.values() yields (course, score) pairs, so countByKey counts per course.
print('rdd:', name.values().countByKey())
df.groupBy('course').count().show()
spark.sql('SELECT course,COUNT(course) FROM sql GROUP BY course').show()
rdd: defaultdict(<class 'int'>, {'OperatingSystem': 134, 'Python': 136, 'ComputerNetwork': 142, 'Software': 132, 'DataBase': 126, 'Algorithm': 144, 'DataStructure': 131, 'CLanguage': 128})
+---------------+-----+
| course|count|
+---------------+-----+
|ComputerNetwork| 142|
| CLanguage| 128|
| DataStructure| 131|
| DataBase| 126|
| Algorithm| 144|
|OperatingSystem| 134|
| Software| 132|
| Python| 136|
+---------------+-----+
+---------------+-------------+
| course|count(course)|
+---------------+-------------+
|ComputerNetwork| 142|
| CLanguage| 128|
| DataStructure| 131|
| DataBase| 126|
| Algorithm| 144|
|OperatingSystem| 134|
| Software| 132|
| Python| 136|
+---------------+-------------+
# Number of students scoring above 95 in each course.
print(rdd.filter(lambda rec: rec[2] > 95).map(lambda rec: (rec[1], rec[2])).countByKey())
df.filter(df['score'] > 95).groupBy('course').count().show()
spark.sql('SELECT course,count(course) FROM sql WHERE score > 95 GROUP BY course').show()
defaultdict(<class 'int'>, {'OperatingSystem': 11, 'Software': 5, 'Python': 18, 'CLanguage': 14, 'Algorithm': 7, 'DataBase': 4, 'ComputerNetwork': 10, 'DataStructure': 5})
+---------------+-----+
| course|count|
+---------------+-----+
|ComputerNetwork| 10|
| CLanguage| 14|
| DataStructure| 5|
| DataBase| 4|
| Algorithm| 7|
|OperatingSystem| 11|
| Software| 5|
| Python| 18|
+---------------+-----+
+---------------+-------------+
| course|count(course)|
+---------------+-------------+
|ComputerNetwork| 10|
| CLanguage| 14|
| DataStructure| 5|
| DataBase| 4|
| Algorithm| 7|
|OperatingSystem| 11|
| Software| 5|
| Python| 18|
+---------------+-------------+
# How many perfect scores (100) were achieved in the 'Python' course?
print('rdd:', rdd.filter(lambda rec: rec[1] == 'Python' and rec[2] == 100).count())
print(df.filter((df['course'] == 'Python') & (df['score'] == 100)).count())
spark.sql('SELECT COUNT("Python") FROM sql WHERE course = "Python" and score = 100').show()
rdd: 5
5
+-------------+
|count(Python)|
+-------------+
| 5|
+-------------+
# How many courses did Tom take, and what score did he get in each?
tom = rdd.filter(lambda rec: rec[0] == 'Tom')
print('rdd', tom.count(), tom.collect())
print('df', df.filter(df['name'] == 'Tom').count())
df.filter(df['name'] == 'Tom').show()
spark.sql('SELECT COUNT(course) FROM sql WHERE name = "Tom"').show()
spark.sql('SELECT course,score FROM sql WHERE name = "Tom"').show()
rdd 5 [['Tom', 'DataBase', 26], ['Tom', 'Algorithm', 12], ['Tom', 'OperatingSystem', 16], ['Tom', 'Python', 40], ['Tom', 'Software', 60]]
df 5
+----+---------------+-----+
|name| course|score|
+----+---------------+-----+
| Tom| DataBase| 26|
| Tom| Algorithm| 12|
| Tom|OperatingSystem| 16|
| Tom| Python| 40|
| Tom| Software| 60|
+----+---------------+-----+
+-------------+
|count(course)|
+-------------+
| 5|
+-------------+
+---------------+-----+
| course|score|
+---------------+-----+
| DataBase| 26|
| Algorithm| 12|
|OperatingSystem| 16|
| Python| 40|
| Software| 60|
+---------------+-----+
# Tom's scores sorted from highest to lowest.
print(rdd.filter(lambda rec: rec[0] == 'Tom').sortBy(lambda rec: rec[2], ascending=False).collect())
df.filter(df['name'] == 'Tom').sort(df['score'].desc()).show()
spark.sql('SELECT * FROM sql WHERE name = "Tom" ORDER BY score DESC').show()
[['Tom', 'Software', 60], ['Tom', 'Python', 40], ['Tom', 'DataBase', 26], ['Tom', 'OperatingSystem', 16], ['Tom', 'Algorithm', 12]]
+----+---------------+-----+
|name| course|score|
+----+---------------+-----+
| Tom| Software| 60|
| Tom| Python| 40|
| Tom| DataBase| 26|
| Tom|OperatingSystem| 16|
| Tom| Algorithm| 12|
+----+---------------+-----+
+----+---------------+-----+
|name| course|score|
+----+---------------+-----+
| Tom| Software| 60|
| Tom| Python| 40|
| Tom| DataBase| 26|
| Tom|OperatingSystem| 16|
| Tom| Algorithm| 12|
+----+---------------+-----+
# Which courses did Tom take?
print(rdd.filter(lambda rec: rec[0] == 'Tom').collect())
df.filter(df['name'] == 'Tom').show()
spark.sql('SELECT * FROM sql WHERE name = "Tom"').show()
[['Tom', 'DataBase', 26], ['Tom', 'Algorithm', 12], ['Tom', 'OperatingSystem', 16], ['Tom', 'Python', 40], ['Tom', 'Software', 60]]
+----+---------------+-----+
|name| course|score|
+----+---------------+-----+
| Tom| DataBase| 26|
| Tom| Algorithm| 12|
| Tom|OperatingSystem| 16|
| Tom| Python| 40|
| Tom| Software| 60|
+----+---------------+-----+
+----+---------------+-----+
|name| course|score|
+----+---------------+-----+
| Tom| DataBase| 26|
| Tom| Algorithm| 12|
| Tom|OperatingSystem| 16|
| Tom| Python| 40|
| Tom| Software| 60|
+----+---------------+-----+
# Tom's average score.
import numpy as np
# name holds (student, (course, score)) pairs; extract just the score for Tom.
# (Equivalent to the double .values() trick: .values() peels off (course, score),
# and a second .values() peels off the score.)
print(np.mean(name.filter(lambda kv: kv[0] == 'Tom').map(lambda kv: kv[1][1]).collect()))
df.filter(df['name'] == 'Tom').agg({'score':'avg'}).show()
spark.sql('SELECT AVG(score) FROM sql WHERE name = "Tom"').show()
30.8
+----------+
|avg(score)|
+----------+
| 30.8|
+----------+
+----------+
|avg(score)|
+----------+
| 30.8|
+----------+
# Number of failing (<60) students in 'OperatingSystem'.
print('rdd', rdd.filter(lambda rec: rec[1] == 'OperatingSystem' and rec[2] < 60).count())
print('df', df.filter((df['course'] == 'OperatingSystem') & (df['score'] < 60)).count())
spark.sql('SELECT COUNT(score) FROM sql WHERE course = "OperatingSystem" AND score < 60').show()
rdd 63
df 63
+------------+
|count(score)|
+------------+
| 63|
+------------+
# Average score for 'OperatingSystem'.
os_scores = rdd.filter(lambda rec: rec[1] == 'OperatingSystem').map(lambda rec: rec[2]).collect()
print('rdd', np.mean(os_scores))
df.filter(df['course'] == 'OperatingSystem').agg({'score':'avg'}).show()
spark.sql('SELECT AVG(score) FROM sql WHERE course = "OperatingSystem"').show()
rdd 54.940298507462686
+------------------+
| avg(score)|
+------------------+
|54.940298507462686|
+------------------+
+------------------+
| avg(score)|
+------------------+
|54.940298507462686|
+------------------+
# Number of students scoring above 90 in 'OperatingSystem'.
print('rdd', rdd.filter(lambda rec: rec[1] == 'OperatingSystem' and rec[2] > 90).count())
print('df', df.filter((df['course'] == 'OperatingSystem') & (df['score'] > 90)).count())
spark.sql('SELECT COUNT("OperatingSystem > 90") FROM sql WHERE course = "OperatingSystem" and score >90').show()
rdd 15
df 15
+---------------------------+
|count(OperatingSystem > 90)|
+---------------------------+
| 15|
+---------------------------+
# Top 3 scores in 'OperatingSystem'.
# NOTE: several students are tied at 100, so which three rows appear can differ
# between the RDD, DataFrame, and SQL routes (visible in the pasted output).
print(rdd.filter(lambda rec: rec[1] == 'OperatingSystem').sortBy(lambda rec: rec[2], ascending=False).take(3))
print(df.filter(df['course'] == 'OperatingSystem').sort(df['score'].desc()).head(3))
spark.sql('SELECT * FROM sql WHERE course = "OperatingSystem" ORDER BY score desc limit 3').show()
[['Aaron', 'OperatingSystem', 100], ['Bing', 'OperatingSystem', 100], ['Evan', 'OperatingSystem', 100]]
[Row(name='Harold', course='OperatingSystem', score=100), Row(name='Evan', course='OperatingSystem', score=100), Row(name='Aaron', course='OperatingSystem', score=100)]
+------+---------------+-----+
| name| course|score|
+------+---------------+-----+
|Harold|OperatingSystem| 100|
| Evan|OperatingSystem| 100|
| Aaron|OperatingSystem| 100|
+------+---------------+-----+
# Add 20 bonus points to every score (preview of the first 3 rows).
print(rdd.map(lambda rec: (rec[0], rec[1], rec[2] + 20)).take(3))
print(df.select(df['score'] + 20).head(3))
print(spark.sql('SELECT (score + 20) AS score FROM sql').head(3))
[('Aaron', 'OperatingSystem', 120), ('Aaron', 'Python', 70), ('Aaron', 'ComputerNetwork', 50)]
[Row((score + 20)=120), Row((score + 20)=70), Row((score + 20)=50)]
[Row(score=120), Row(score=70), Row(score=50)]
# Average score for every course.
course = rdd.map(lambda rec: (rec[1], rec[2]))
# Accumulate (sum, count) per course with combineByKey.
# FIX: mergeCombiners must add the (sum, count) tuples field-wise. The original
# used `lambda a,b: a+b`, which CONCATENATES the tuples — silently producing a
# wrong average whenever one course's records span multiple partitions.
sum_count = course.combineByKey(
    lambda v: (v, 1),                         # createCombiner
    lambda acc, v: (acc[0] + v, acc[1] + 1),  # mergeValue
    lambda a, b: (a[0] + b[0], a[1] + b[1]),  # mergeCombiners (field-wise)
)
print(sum_count.map(lambda kv: (kv[0], kv[1][0] / kv[1][1])).collect())
df.groupBy('course').avg('score').show()
spark.sql('SELECT course,AVG(score) FROM sql GROUP BY course').show()
[('OperatingSystem', 54.940298507462686), ('Python', 57.8235294117647), ('ComputerNetwork', 51.901408450704224), ('Software', 50.90909090909091), ('DataBase', 50.53968253968254), ('Algorithm', 48.833333333333336), ('DataStructure', 47.57251908396947), ('CLanguage', 50.609375)]
+---------------+------------------+
| course| avg(score)|
+---------------+------------------+
|ComputerNetwork|51.901408450704224|
| CLanguage| 50.609375|
| DataStructure| 47.57251908396947|
| DataBase| 50.53968253968254|
| Algorithm|48.833333333333336|
|OperatingSystem|54.940298507462686|
| Software| 50.90909090909091|
| Python| 57.8235294117647|
+---------------+------------------+
+---------------+------------------+
| course| avg(score)|
+---------------+------------------+
|ComputerNetwork|51.901408450704224|
| CLanguage| 50.609375|
| DataStructure| 47.57251908396947|
| DataBase| 50.53968253968254|
| Algorithm|48.833333333333336|
|OperatingSystem|54.940298507462686|
| Software| 50.90909090909091|
| Python| 57.8235294117647|
+---------------+------------------+
# How many students took exactly 7 courses?
print(rdd.map(lambda rec: (rec[0], 1)).reduceByKey(lambda x, y: x + y).filter(lambda kv: kv[1] == 7).count())
print(df.groupby('name').count().where("count(name) == 7").count())
print(spark.sql('SELECT name,COUNT(name) AS count FROM sql GROUP BY name HAVING count = 7').count())
10
10
10
# Students scoring above 95 per course (a repeat of the earlier query).
print(rdd.filter(lambda rec: rec[2] > 95).map(lambda rec: (rec[1], rec[2])).countByKey())
df.filter(df['score'] > 95).groupBy('course').count().show()
spark.sql('SELECT course,count(course) FROM sql WHERE score > 95 GROUP BY course').show()
defaultdict(<class 'int'>, {'OperatingSystem': 11, 'Software': 5, 'Python': 18, 'CLanguage': 14, 'Algorithm': 7, 'DataBase': 4, 'ComputerNetwork': 10, 'DataStructure': 5})
+---------------+-----+
| course|count|
+---------------+-----+
|ComputerNetwork| 10|
| CLanguage| 14|
| DataStructure| 5|
| DataBase| 4|
| Algorithm| 7|
|OperatingSystem| 11|
| Software| 5|
| Python| 18|
+---------------+-----+
+---------------+-------------+
| course|count(course)|
+---------------+-------------+
|ComputerNetwork| 10|
| CLanguage| 14|
| DataStructure| 5|
| DataBase| 4|
| Algorithm| 7|
|OperatingSystem| 11|
| Software| 5|
| Python| 18|
+---------------+-------------+
# Per course: enrollment, average score, failing count, and pass rate.
course = rdd.map(lambda rec: (rec[1], rec[2]))
print(course.countByKey())
# Accumulate (sum, count) per course for the average.
# FIX: mergeCombiners must add the (sum, count) tuples field-wise. The original
# `lambda a,b: a+b` concatenated them, which gives a wrong average whenever a
# course's records span more than one partition.
sum_count = course.combineByKey(
    lambda v: (v, 1),
    lambda acc, v: (acc[0] + v, acc[1] + 1),
    lambda a, b: (a[0] + b[0], a[1] + b[1]),
)
print(sum_count.map(lambda kv: (kv[0], kv[1][0] / kv[1][1])).collect())
print(course.filter(lambda kv: kv[1] < 60).countByKey())
# Pass rate = 1 - failing/total, rounded to 2 decimals.
# NOTE(review): this is an inner join, so a course with zero failing students
# would silently drop out of the result — acceptable for this dataset.
rdd.map(lambda rec: (rec[1], 1)).reduceByKey(lambda x, y: x + y).join(
    rdd.map(lambda rec: (rec[1], rec[2])).filter(lambda kv: int(kv[1]) < 60).map(lambda kv: (kv[0], 1)).reduceByKey(lambda x, y: x + y)
).map(lambda kv: (kv[0], round(1 - (kv[1][1] / kv[1][0]), 2))).collect()
defaultdict(<class 'int'>, {'OperatingSystem': 134, 'Python': 136, 'ComputerNetwork': 142, 'Software': 132, 'DataBase': 126, 'Algorithm': 144, 'DataStructure': 131, 'CLanguage': 128})
[('OperatingSystem', 54.940298507462686), ('Python', 57.8235294117647), ('ComputerNetwork', 51.901408450704224), ('Software', 50.90909090909091), ('DataBase', 50.53968253968254), ('Algorithm', 48.833333333333336), ('DataStructure', 47.57251908396947), ('CLanguage', 50.609375)]
defaultdict(<class 'int'>, {'Python': 67, 'ComputerNetwork': 80, 'DataBase': 71, 'Algorithm': 90, 'DataStructure': 84, 'OperatingSystem': 63, 'Software': 76, 'CLanguage': 78})
[('OperatingSystem', 0.53),
('Python', 0.51),
('ComputerNetwork', 0.44),
('Software', 0.42),
('DataBase', 0.44),
('Algorithm', 0.38),
('DataStructure', 0.36),
('CLanguage', 0.39)]
# Per-course summary via DataFrame joins:
#   totalCount = enrollment, avg = mean score, notPass = scores below 60.
df1 = df.groupBy('course').count().withColumnRenamed('count','totalCount').join(
df.groupBy('course').avg().withColumnRenamed('avg(score)','avg'),"course"
).join(
df.filter(df['score'] < 60).groupBy('course').count().withColumnRenamed('count','notPass'),'course'
)
# The computed pass-rate column is auto-named '(1 - (notPass / totalCount))'
# by Spark; the rename below depends on that exact generated name, so it would
# break if the expression changed.
df1.select(
'course','totalCount','avg','notPass',1-df1['notPass']/df1['totalCount']
).withColumnRenamed('(1 - (notPass / totalCount))','passRat').show()
+---------------+----------+------------------+-------+-------------------+
| course|totalCount| avg|notPass| passRat|
+---------------+----------+------------------+-------+-------------------+
|ComputerNetwork| 142|51.901408450704224| 80|0.43661971830985913|
| CLanguage| 128| 50.609375| 78| 0.390625|
| DataStructure| 131| 47.57251908396947| 84| 0.3587786259541985|
| DataBase| 126| 50.53968253968254| 71| 0.4365079365079365|
| Algorithm| 144|48.833333333333336| 90| 0.375|
|OperatingSystem| 134|54.940298507462686| 63| 0.5298507462686567|
| Software| 132| 50.90909090909091| 76| 0.4242424242424242|
| Python| 136| 57.8235294117647| 67| 0.5073529411764706|
+---------------+----------+------------------+-------+-------------------+
# Same per-course summary in SQL: view "a" holds totals/averages, view "b"
# holds failing counts; a LEFT JOIN keeps every course even with no failures
# (notPass would then be NULL).
spark.sql('SELECT course, COUNT(name) AS totalCount,avg(score) AS avg FROM sql GROUP BY course').createOrReplaceTempView("a")
spark.sql('SELECT course,COUNT(score) AS notPass FROM sql WHERE score<60 GROUP BY course').createOrReplaceTempView("b")
spark.sql('SELECT a.course,a.totalCount,round(a.avg,2)AS avg,b.notPass ,round((a.totalCount-b.notPass)/a.totalCount,2) AS passRat FROM a left JOIN b ON a.course=b.course').show()
+---------------+----------+-----+-------+-------+
| course|totalCount| avg|notPass|passRat|
+---------------+----------+-----+-------+-------+
|ComputerNetwork| 142| 51.9| 80| 0.44|
| CLanguage| 128|50.61| 78| 0.39|
| DataStructure| 131|47.57| 84| 0.36|
| DataBase| 126|50.54| 71| 0.44|
| Algorithm| 144|48.83| 90| 0.38|
|OperatingSystem| 134|54.94| 63| 0.53|
| Software| 132|50.91| 76| 0.42|
| Python| 136|57.82| 67| 0.51|
+---------------+----------+-----+-------+-------+
# Score bands: excellent (>=90), good (75-89), pass (60-74), fail (<60).
print('优秀:', rdd.filter(lambda rec: rec[2] >= 90).count())
print('良好:', rdd.filter(lambda rec: 75 <= rec[2] < 90).count())
print('通过:', rdd.filter(lambda rec: 60 <= rec[2] < 75).count())
print('不及格:', rdd.filter(lambda rec: rec[2] < 60).count())
优秀: 138
良好: 143
通过: 183
不及格: 609
# The same score bands computed on the DataFrame (where == filter in Spark).
print('优秀:', df.where(df['score'] >= 90).count())
print('良好:', df.where((df['score'] >= 75) & (df['score'] < 90)).count())
print('通过:', df.where((df['score'] >= 60) & (df['score'] < 75)).count())
print('不及格:', df.where(df['score'] < 60).count())
优秀: 138
良好: 143
通过: 183
不及格: 609
# The same score bands in SQL; count("优秀") just counts matching rows (the
# string literal is never NULL), while labelling the output column.
spark.sql('SELECT count("优秀") FROM sql WHERE score >= 90').show()
spark.sql('SELECT count("良好") FROM sql WHERE score >= 75 AND score < 90').show()
spark.sql('SELECT count("通过") FROM sql WHERE score >= 60 AND score < 75').show()
spark.sql('SELECT count("不及格") FROM sql WHERE score < 60').show()
+-----------+
|count(优秀)|
+-----------+
| 138|
+-----------+
+-----------+
|count(良好)|
+-----------+
| 143|
+-----------+
+-----------+
|count(通过)|
+-----------+
| 183|
+-----------+
+-------------+
|count(不及格)|
+-------------+
| 609|
+-------------+
# 同时选修了DataStructure和 DataBase 的学生
DAD = rdd.filter(lambda a:a[1] == 'DataStructure' or a[1] == 'DataBase')
print(DAD.map(lambda a:(a[0],1)).reduceByKey(lambda a,b:a+b).filter(lambda a:a[1] > 1).map(lambda a:a[0]).collect())
print(df[(df['course'] == 'DataStructure') | (df['course'] == 'DataBase')].groupBy('name').count().where('count > 1').select('name').show())
spark.sql('SELECT name,COUNT(name) AS count FROM sql WHERE course = "DataStructure" OR course = "DataBase" \
GROUP BY name HAVING count > 1').select('name').show()
['Adonis', 'Alva', 'Amos', 'Armand', 'Bart', 'Beck', 'Benedict', 'Bennett', 'Bing', 'Bowen', 'Brady', 'Brandon', 'Bruno', 'Chad', 'Chester', 'Christopher', 'Clarence', 'Clark', 'Clyde', 'Colby', 'Dean', 'Devin', 'Donahue', 'Edward', 'Elroy', 'Elton', 'Elvis', 'Giles', 'Harvey', 'Hiram', 'Hobart', 'Jeff', 'Jeremy', 'Jerome', 'Jo', 'Kennedy', 'Kent', 'Kevin', 'Kim', 'Leopold', 'Les', 'Louis', 'Marico', 'Mark', 'Matt', 'Miles', 'Nelson', 'Nicholas', 'Payne', 'Peter', 'Raymondt', 'Ron', 'Roy', 'Saxon', 'Sidney', 'Solomon', 'Upton', 'Valentine', 'Webb', 'William']
+-----------+
| name|
+-----------+
| Chad|
| Donahue|
| Brady|
| Sidney|
| Bruno|
| Valentine|
| Alva|
| Webb|
| Jo|
| Roy|
| Upton|
| Amos|
| Bennett|
| Payne|
| Les|
| Marico|
| Dean|
| Raymondt|
|Christopher|
| Elvis|
+-----------+
only showing top 20 rows
None
+-----------+
| name|
+-----------+
| Chad|
| Donahue|
| Brady|
| Sidney|
| Bruno|
| Valentine|
| Alva|
| Webb|
| Jo|
| Roy|
| Upton|
| Amos|
| Bennett|
| Payne|
| Les|
| Marico|
| Dean|
| Raymondt|
|Christopher|
| Elvis|
+-----------+
only showing top 20 rows
# Students who took DataStructure but NOT DataBase (RDD route).
DOD = rdd.filter(lambda a:a[1] == 'DataStructure' or a[1] == 'DataBase')
# NOTE(review): reduceByKey here CONCATENATES the course-name strings — a
# student with both courses reduces to 'DataStructureDataBase' (or the reverse)
# and is filtered out, leaving only students whose sole matching course is
# 'DataStructure'. It works, but is fragile; a set-based groupByKey would be
# clearer and order-independent.
print(DOD.map(lambda a:(a[0],a[1])).reduceByKey(lambda a,b:a+b).filter(lambda a:a[1] == 'DataStructure').map(lambda a:a[0]).collect())
['Abel', 'Abraham', 'Adolph', 'Albert', 'Alexander', 'Alston', 'Alvis', 'Andrew', 'Aries', 'Arlen', 'Baron', 'Barry', 'Benjamin', 'Benson', 'Berg', 'Bernard', 'Bernie', 'Blithe', 'Boris', 'Boyd', 'Brian', 'Broderick', 'Brook', 'Channing', 'Christ', 'Christian', 'Clare', 'Cliff', 'Dempsey', 'Dennis', 'Dick', 'Drew', 'Duncann', 'Elmer', 'Evan', 'George', 'Greg', 'Griffith', 'Harold', 'Hayden', 'Hilary', 'Jay', 'Jeffrey', 'Jerry', 'Jesse', 'Joseph', 'Joshua', 'Lester', 'Lewis', 'Lionel', 'Lucien', 'Luthers', 'Marlon', 'Maurice', 'Max', 'Merlin', 'Mick', 'Mike', 'Perry', 'Pete', 'Randolph', 'Robin', 'Sandy', 'Spencer', 'Stan', 'Virgil', 'Ward', 'Willie', 'Winston', 'Wordsworth']
# DataFrame route: right-join DataBase rows onto DataStructure rows; a NULL
# 'DataBase' column then marks students who never took DataBase.
df1 = df.filter(df['course'] == 'DataBase').select('name','course').withColumnRenamed('course','DataBase')
df2 = df.filter(df['course'] == 'DataStructure').select('name','course').withColumnRenamed('course','DataStructure')
df3 = df1.join(df2,'name','right')
df3.filter(df3['DataBase'].isNull()).select('name').show()
+---------+
| name|
+---------+
| Baron|
| Boyd|
| Blithe|
| Willie|
| Benjamin|
| Clare|
| Alston|
| Hayden|
|Christian|
| Pete|
| Adolph|
| Spencer|
| Alvis|
| Barry|
| Sandy|
| Albert|
| Bernard|
| Merlin|
| Lester|
| Jeffrey|
+---------+
only showing top 20 rows
# SQL route: distinct (name, course) views per course, then LEFT JOIN; a NULL
# DataBase column identifies students who took DataStructure but not DataBase.
spark.sql('SELECT DISTINCT name,course AS DataStructure FROM sql WHERE course = "DataStructure"').createOrReplaceTempView("a")
spark.sql('SELECT DISTINCT name,course AS DataBase FROM sql WHERE course = "DataBase"').createOrReplaceTempView("b")
spark.sql('SELECT a.name FROM a LEFT JOIN b ON a.name = b.name WHERE DataBase is null').show()
+---------+
| name|
+---------+
| Baron|
| Boyd|
| Blithe|
| Willie|
| Benjamin|
| Clare|
| Alston|
| Hayden|
|Christian|
| Pete|
| Adolph|
| Spencer|
| Alvis|
| Barry|
| Sandy|
| Albert|
| Bernard|
| Merlin|
| Lester|
| Jeffrey|
+---------+
only showing top 20 rows
# 选修课程数少于3门的同学
print(rdd.map(lambda a:(a[0],1)).reduceByKey(lambda a,b:a+b).filter(lambda a:a[1]<3).map(lambda a:a[0]).collect())
df.groupBy(['name']).count().where("count < 3").select('name').show()
spark.sql('SELECT name,count(course) AS count FROM sql GROUP BY name HAVING count < 3').select('name').show()
['Alfred', 'Angelo', 'Aries', 'Armstrong', 'Barton', 'Bernard', 'Bill', 'Bishop', 'Boyce', 'Bradley', 'Christ', 'Christian', 'Claude', 'Conrad', 'Don', 'Elvis', 'Eugene', 'Henry', 'Leonard', 'Levi', 'Lou', 'Matthew', 'Maurice', 'Milo', 'Newman', 'Philip', 'Sandy', 'Simon', 'Uriah', 'Victor', 'Webster']
+---------+
| name|
+---------+
| Matthew|
|Christian|
| Sandy|
| Alfred|
| Bernard|
| Webster|
| Don|
| Bill|
| Elvis|
| Aries|
| Boyce|
| Lou|
| Simon|
| Christ|
| Maurice|
| Bradley|
|Armstrong|
| Milo|
| Uriah|
| Leonard|
+---------+
only showing top 20 rows
+---------+
| name|
+---------+
| Matthew|
|Christian|
| Sandy|
| Alfred|
| Bernard|
| Webster|
| Don|
| Bill|
| Elvis|
| Aries|
| Boyce|
| Lou|
| Simon|
| Christ|
| Maurice|
| Bradley|
|Armstrong|
| Milo|
| Uriah|
| Leonard|
+---------+
only showing top 20 rows
# 选修6门及以上课程数的同学
print(rdd.map(lambda a:(a[0],1)).reduceByKey(lambda a,b:a+b).filter(lambda a:a[1]>=6).map(lambda a:a[0]).collect())
df.groupBy('name').count().where('count >= 6').select('name').show()
spark.sql('SELECT name,count(name) AS count FROM sql GROUP BY name HAVING count >= 6').select('name').show()
['Alvis', 'Baron', 'Benedict', 'Bennett', 'Bing', 'Booth', 'Boris', 'Brian', 'Chad', 'Chester', 'Clarence', 'Clark', 'Clyde', 'Dean', 'Derrick', 'Giles', 'Glenn', 'Harlan', 'Harvey', 'Hiram', 'Jay', 'Jeremy', 'Jesse', 'John', 'Leopold', 'Les', 'Louis', 'Marico', 'Mark', 'Miles', 'Payne', 'Raymondt', 'Rock', 'Ron', 'Roy', 'Saxon', 'Sean', 'Sebastian', 'Valentine', 'Webb', 'William']
+---------+
| name|
+---------+
| Chad|
| Baron|
|Valentine|
| Webb|
| Roy|
| Alvis|
| Rock|
| Bennett|
| Payne|
| Les|
| Marico|
| Harlan|
| Dean|
| Raymondt|
| Harvey|
| Bing|
| Derrick|
| Saxon|
| Clarence|
| Sean|
+---------+
only showing top 20 rows
+---------+
| name|
+---------+
| Chad|
| Baron|
|Valentine|
| Webb|
| Roy|
| Alvis|
| Rock|
| Bennett|
| Payne|
| Les|
| Marico|
| Harlan|
| Dean|
| Raymondt|
| Harvey|
| Bing|
| Derrick|
| Saxon|
| Clarence|
| Sean|
+---------+
only showing top 20 rows
# 查询平均成绩大于等于60分的姓名和平均成绩
course = rdd.map(lambda a:(a[0],a[2])).combineByKey(lambda a:(a,1),lambda a,b:(a[0]+b,a[1]+1),lambda a,b:a+b)
print(course.map(lambda a:(a[0],a[1][0]/a[1][1])).filter(lambda a:a[1] >= 60).collect())
df.groupBy('name').avg().where('avg(score) >= 60').show()
spark.sql('SELECT name,AVG(score) AS avg FROM sql GROUP BY name HAVING avg >= 60').show()
[('Aaron', 68.5), ('Abraham', 62.666666666666664), ('Adair', 68.66666666666667), ('Adolph', 67.5), ('Alan', 77.6), ('Albert', 66.0), ('Aldrich', 73.33333333333333), ('Alfred', 78.0), ('Alston', 69.0), ('Alvin', 74.0), ('Andrew', 68.0), ('Antony', 68.8), ('Baron', 60.0), ('Barton', 90.0), ('Basil', 61.5), ('Benedict', 70.33333333333333), ('Benjamin', 77.5), ('Benson', 60.0), ('Berg', 69.5), ('Bernard', 72.0), ('Bevis', 74.0), ('Bill', 71.0), ('Bing', 62.666666666666664), ('Blake', 63.0), ('Blithe', 81.33333333333333), ('Brady', 64.8), ('Brandon', 64.4), ('Brook', 63.0), ('Bruce', 62.666666666666664), ('Bruno', 68.4), ('Chapman', 60.5), ('Christ', 78.0), ('Clarence', 70.28571428571429), ('Claude', 61.0), ('Cleveland', 73.0), ('Clyde', 71.71428571428571), ('Colby', 62.5), ('Conrad', 62.0), ('Dean', 60.857142857142854), ('Dempsey', 67.0), ('Dennis', 64.0), ('Duncann', 63.2), ('Elliot', 78.0), ('Elroy', 61.6), ('Eugene', 80.0), ('Frank', 64.0), ('Gilbert', 82.66666666666667), ('Harold', 60.5), ('Jeffrey', 64.0), ('Jeremy', 73.0), ('Jesse', 64.0), ('Ken', 67.33333333333333), ('Len', 67.6), ('Leopold', 63.142857142857146), ('Lester', 86.5), ('Levi', 61.0), ('Lou', 70.0), ('Lucien', 65.2), ('Marsh', 65.0), ('Martin', 73.33333333333333), ('Michael', 60.8), ('Mick', 60.0), ('Mike', 70.0), ('Miles', 61.666666666666664), ('Milo', 66.0), ('Newman', 68.0), ('Nick', 64.8), ('Payne', 68.0), ('Perry', 62.0), ('Peter', 61.5), ('Randolph', 63.333333333333336), ('Raymondt', 73.66666666666667), ('Rock', 63.0), ('Rod', 63.0), ('Rory', 61.5), ('Sandy', 72.0), ('Simon', 80.0), ('Spencer', 64.0), ('Tracy', 60.0), ('Truman', 62.666666666666664), ('Upton', 67.6), ('Verne', 66.0), ('Vic', 61.333333333333336), ('Vincent', 65.6), ('Ward', 71.5), ('William', 68.0), ('Wright', 70.0)]
+--------+-----------------+
| name| avg(score)|
+--------+-----------------+
| Rod| 63.0|
| Baron| 60.0|
| Len| 67.6|
| Alvin| 74.0|
| Elliot| 78.0|
| Blithe|81.33333333333333|
| Brady| 64.8|
|Benjamin| 77.5|
| Bruno| 68.4|
| Alston| 69.0|
| Marsh| 65.0|
| Adolph| 67.5|
| Spencer| 64.0|
| Sandy| 72.0|
| Adair|68.66666666666667|
| Alfred| 78.0|
| Upton| 67.6|
| Tracy| 60.0|
| Albert| 66.0|
| Rock| 63.0|
+--------+-----------------+
only showing top 20 rows
+--------+-----------------+
| name| avg|
+--------+-----------------+
| Rod| 63.0|
| Baron| 60.0|
| Len| 67.6|
| Alvin| 74.0|
| Elliot| 78.0|
| Blithe|81.33333333333333|
| Brady| 64.8|
|Benjamin| 77.5|
| Bruno| 68.4|
| Alston| 69.0|
| Marsh| 65.0|
| Adolph| 67.5|
| Spencer| 64.0|
| Sandy| 72.0|
| Adair|68.66666666666667|
| Alfred| 78.0|
| Upton| 67.6|
| Tracy| 60.0|
| Albert| 66.0|
| Rock| 63.0|
+--------+-----------------+
only showing top 20 rows
# 找出平均分最高的10位同学
course = rdd.map(lambda a:(a[0],a[2])).combineByKey(lambda a:(a,1),lambda a,b:(a[0]+b,a[1]+1),lambda a,b:a+b)
print(course.map(lambda a:(a[0],a[1][0]/a[1][1])).sortBy(lambda a:a[1],False).take(10))
df2 = df.groupBy('name').avg()
df2.sort(-df2['avg(score)']).limit(10).show()
spark.sql('SELECT name,AVG(score) AS avg FROM sql GROUP BY name ORDER BY avg DESC LIMIT 10').show()
[('Barton', 90.0), ('Lester', 86.5), ('Gilbert', 82.66666666666667), ('Blithe', 81.33333333333333), ('Eugene', 80.0), ('Simon', 80.0), ('Alfred', 78.0), ('Christ', 78.0), ('Elliot', 78.0), ('Alan', 77.6)]
+-------+-----------------+
| name| avg(score)|
+-------+-----------------+
| Barton| 90.0|
| Lester| 86.5|
|Gilbert|82.66666666666667|
| Blithe|81.33333333333333|
| Simon| 80.0|
| Eugene| 80.0|
| Elliot| 78.0|
| Alfred| 78.0|
| Christ| 78.0|
| Alan| 77.6|
+-------+-----------------+
+-------+-----------------+
| name| avg|
+-------+-----------------+
| Barton| 90.0|
| Lester| 86.5|
|Gilbert|82.66666666666667|
| Blithe|81.33333333333333|
| Simon| 80.0|
| Eugene| 80.0|
| Elliot| 78.0|
| Alfred| 78.0|
| Christ| 78.0|
| Alan| 77.6|
+-------+-----------------+