Learning Spark Operators
I. Operators in Detail
Spark supports two types of operations: transformations and actions. A transformation creates a new dataset from an existing one; an action runs a computation on a dataset and returns a value to the driver.
(1) Transformations
All transformations are lazy: calling a transformation operator does not compute a result immediately, it only records the operation. The recorded transformations are executed only when an action is triggered.
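A minimal sketch of this laziness, assuming a SparkContext named sc like the one created by the getsc() helper used in the examples below: the map call returns immediately without touching the data; the work only happens when the collect() action runs.

# Nothing is computed here -- Spark only records the lineage.
lazy_rdd = sc.parallelize(range(5)).map(lambda x: x * 10)
# The action triggers the actual computation.
print(lazy_rdd.collect())  # [0, 10, 20, 30, 40]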
1. map(func)
Applies func to each element of the dataset and returns a new distributed dataset.
def my_map():
    data = [1, 2, 3, 4, 5]
    sc = getsc()
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.map(lambda x: x + 1)
    print(type(rdd2.collect()))  # <class 'list'>
    print(rdd2.collect())        # [2, 3, 4, 5, 6]
    sc.stop()
Full code:
from pyspark import SparkConf, SparkContext

def getsc():
    # Create SparkConf and configure Spark
    conf = SparkConf().setMaster("local[2]").setAppName("sparktest")
    # Create SparkContext
    sc = SparkContext(conf=conf)
    return sc

def my_map():
    data = [1, 2, 3, 4, 5]
    sc = getsc()
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.map(lambda x: x + 1)
    print(type(rdd2.collect()))  # <class 'list'>
    print(rdd2.collect())        # [2, 3, 4, 5, 6]
    sc.stop()

if __name__ == '__main__':
    my_map()
2. filter(func)
Returns a new distributed dataset containing only the elements for which func returns True.
def my_filter():
    data = [1, 2, 3, 4, 5]
    sc = getsc()
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.filter(lambda x: x > 3)
    print(type(rdd2.collect()))  # <class 'list'>
    print(rdd2.collect())        # [4, 5]
    sc.stop()
Full code:
from pyspark import SparkConf, SparkContext

def getsc():
    # Create SparkConf and configure Spark
    conf = SparkConf().setMaster("local[2]").setAppName("sparktest")
    # Create SparkContext
    sc = SparkContext(conf=conf)
    return sc

def my_filter():
    data = [1, 2, 3, 4, 5]
    sc = getsc()
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.filter(lambda x: x > 3)
    print(type(rdd2.collect()))  # <class 'list'>
    print(rdd2.collect())        # [4, 5]
    sc.stop()

if __name__ == '__main__':
    my_filter()
3. flatMap(func)
Each input item can be mapped to zero or more output items, so func should return a sequence; the results are flattened into a single dataset.
def my_flatMap():
    sc = getsc()
    data = ["Hello Word", "Hello Boy", "Hello Girl"]
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.flatMap(lambda x: x.split(" "))
    print(rdd2.collect())  # ['Hello', 'Word', 'Hello', 'Boy', 'Hello', 'Girl']
Full code:
from pyspark import SparkConf, SparkContext

def getsc():
    # Create SparkConf and configure Spark
    conf = SparkConf().setMaster("local[2]").setAppName("sparktest")
    # Create SparkContext
    sc = SparkContext(conf=conf)
    return sc

def my_flatMap():
    sc = getsc()
    data = ["Hello Word", "Hello Boy", "Hello Girl"]
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.flatMap(lambda x: x.split(" "))
    print(rdd2.collect())  # ['Hello', 'Word', 'Hello', 'Boy', 'Hello', 'Girl']
    sc.stop()

if __name__ == '__main__':
    my_flatMap()
4. groupByKey
Groups together the values of all records that share the same key.
def my_groupByKey():
    sc = getsc()
    data = ["Hello Word", "Hello Boy", "Hello Girl"]
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.flatMap(lambda x: x.split(" "))
    # rdd2.collect(): ['Hello', 'Word', 'Hello', 'Boy', 'Hello', 'Girl']
    rdd3 = rdd2.map(lambda x: (x, 1))
    # rdd3.collect(): [('Hello', 1), ('Word', 1), ('Hello', 1), ('Boy', 1), ('Hello', 1), ('Girl', 1)]
    groupByRdd = rdd3.groupByKey()
    # groupByRdd.collect(): [('Hello', <pyspark.resultiterable.ResultIterable object at 0x0000000003168160>), ...]
    rdd4 = groupByRdd.map(lambda x: {x[0]: list(x[1])}).collect()
    # rdd4: [{'Hello': [1, 1, 1]}, {'Boy': [1]}, {'Word': [1]}, {'Girl': [1]}]
Full code:
from pyspark import SparkConf, SparkContext

def getsc():
    # Create SparkConf and configure Spark
    conf = SparkConf().setMaster("local[2]").setAppName("sparktest")
    # Create SparkContext
    sc = SparkContext(conf=conf)
    return sc

def my_groupByKey():
    sc = getsc()
    data = ["Hello Word", "Hello Boy", "Hello Girl"]
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.flatMap(lambda x: x.split(" "))
    # rdd2.collect(): ['Hello', 'Word', 'Hello', 'Boy', 'Hello', 'Girl']
    rdd3 = rdd2.map(lambda x: (x, 1))
    # rdd3.collect(): [('Hello', 1), ('Word', 1), ('Hello', 1), ('Boy', 1), ('Hello', 1), ('Girl', 1)]
    groupByRdd = rdd3.groupByKey()
    # groupByRdd.collect(): [('Hello', <pyspark.resultiterable.ResultIterable object at 0x0000000003168160>), ...]
    rdd4 = groupByRdd.map(lambda x: {x[0]: list(x[1])}).collect()
    print(rdd4)  # [{'Hello': [1, 1, 1]}, {'Boy': [1]}, {'Word': [1]}, {'Girl': [1]}]

if __name__ == '__main__':
    my_groupByKey()
groupByKey takes an RDD of key-value tuples (the RDD must consist of tuples), groups them by key, and collects all values that share a key into an iterable, which is converted to a list here.
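As a side note, mapValues offers a slightly shorter way to materialize the grouped values; this is a sketch that reuses groupByRdd from the code above:

# mapValues(list) converts each ResultIterable of grouped values to a plain list,
# leaving the keys untouched.
print(groupByRdd.mapValues(list).collect())
# [('Hello', [1, 1, 1]), ('Boy', [1]), ('Word', [1]), ('Girl', [1])]  (order may vary)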
5. reduceByKey
Groups records with the same key and reduces their values with the given function.
def my_reduceByKey():
    sc = getsc()
    data = ["Hello Word", "Hello Boy", "Hello Girl"]
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.flatMap(lambda x: x.split(" "))
    # rdd2.collect(): ['Hello', 'Word', 'Hello', 'Boy', 'Hello', 'Girl']
    rdd3 = rdd2.map(lambda x: (x, 1))
    # rdd3.collect(): [('Hello', 1), ('Word', 1), ('Hello', 1), ('Boy', 1), ('Hello', 1), ('Girl', 1)]
    rdd4 = rdd3.reduceByKey(lambda a, b: a + b).collect()
    print(rdd4)  # [('Hello', 3), ('Boy', 1), ('Word', 1), ('Girl', 1)]
Full code:
from pyspark import SparkConf, SparkContext

def getsc():
    # Create SparkConf and configure Spark
    conf = SparkConf().setMaster("local[2]").setAppName("sparktest")
    # Create SparkContext
    sc = SparkContext(conf=conf)
    return sc

def my_reduceByKey():
    sc = getsc()
    data = ["Hello Word", "Hello Boy", "Hello Girl"]
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.flatMap(lambda x: x.split(" "))
    # rdd2.collect(): ['Hello', 'Word', 'Hello', 'Boy', 'Hello', 'Girl']
    rdd3 = rdd2.map(lambda x: (x, 1))
    # rdd3.collect(): [('Hello', 1), ('Word', 1), ('Hello', 1), ('Boy', 1), ('Hello', 1), ('Girl', 1)]
    rdd4 = rdd3.reduceByKey(lambda a, b: a + b).collect()
    print(rdd4)  # [('Hello', 3), ('Boy', 1), ('Word', 1), ('Girl', 1)]

if __name__ == '__main__':
    my_reduceByKey()
Compared with groupByKey, this goes one step further: the list of values for each key has already been reduced (here, summed).
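For word counting the two approaches give the same result; the sketch below, reusing rdd3 from the code above, shows the equivalence. reduceByKey is generally preferred because it combines values on each partition before shuffling them across the network.

# Both lines produce [('Hello', 3), ('Boy', 1), ('Word', 1), ('Girl', 1)] (order may vary).
print(rdd3.groupByKey().mapValues(sum).collect())
print(rdd3.reduceByKey(lambda a, b: a + b).collect())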
6. sortByKey
Suppose we now want to sort the result above by count. sortByKey sorts by key, i.e. by the word itself, which is obviously not what we want, so the pairs have to be transformed first.
def my_sortByKey():
    sc = getsc()
    data = ["Hello Word", "Hello Boy", "Hello Girl"]
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.flatMap(lambda x: x.split(" "))
    # rdd2.collect(): ['Hello', 'Word', 'Hello', 'Boy', 'Hello', 'Girl']
    rdd3 = rdd2.map(lambda x: (x, 1))
    # rdd3.collect(): [('Hello', 1), ('Word', 1), ('Hello', 1), ('Boy', 1), ('Hello', 1), ('Girl', 1)]
    rdd4 = rdd3.reduceByKey(lambda a, b: a + b)
    # rdd4.collect(): [('Hello', 3), ('Boy', 1), ('Word', 1), ('Girl', 1)]
    rdd5 = rdd4.map(lambda x: (x[1], x[0]))
    # rdd5.collect(): [(3, 'Hello'), (1, 'Boy'), (1, 'Word'), (1, 'Girl')]
    rdd6 = rdd5.sortByKey().collect()
    print(rdd6)  # [(1, 'Boy'), (1, 'Word'), (1, 'Girl'), (3, 'Hello')]
Full code:
from pyspark import SparkConf, SparkContext

def getsc():
    # Create SparkConf and configure Spark
    conf = SparkConf().setMaster("local[2]").setAppName("sparktest")
    # Create SparkContext
    sc = SparkContext(conf=conf)
    return sc

def my_sortByKey():
    sc = getsc()
    data = ["Hello Word", "Hello Boy", "Hello Girl"]
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.flatMap(lambda x: x.split(" "))
    # rdd2.collect(): ['Hello', 'Word', 'Hello', 'Boy', 'Hello', 'Girl']
    rdd3 = rdd2.map(lambda x: (x, 1))
    # rdd3.collect(): [('Hello', 1), ('Word', 1), ('Hello', 1), ('Boy', 1), ('Hello', 1), ('Girl', 1)]
    rdd4 = rdd3.reduceByKey(lambda a, b: a + b)
    # rdd4.collect(): [('Hello', 3), ('Boy', 1), ('Word', 1), ('Girl', 1)]
    rdd5 = rdd4.map(lambda x: (x[1], x[0]))
    # rdd5.collect(): [(3, 'Hello'), (1, 'Boy'), (1, 'Word'), (1, 'Girl')]
    rdd6 = rdd5.sortByKey().collect()
    print(rdd6)  # [(1, 'Boy'), (1, 'Word'), (1, 'Girl'), (3, 'Hello')]

if __name__ == '__main__':
    my_sortByKey()
If needed, map the final result back to (word, count) pairs with another map. The default order is ascending; pass a Boolean to sortByKey to choose ascending or descending order.
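For example, here is a sketch of sorting in descending order and swapping the pairs back, reusing rdd5 from the code above:

# sortByKey(False) sorts by key in descending order; the final map restores (word, count) pairs.
rdd7 = rdd5.sortByKey(False).map(lambda x: (x[1], x[0])).collect()
print(rdd7)  # [('Hello', 3), ('Boy', 1), ('Word', 1), ('Girl', 1)]  (ties may appear in any order)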
7. union
def my_union():
    sc = getsc()
    data1 = [1, 2, 3]
    data2 = [4, 5, 6]
    rdd1 = sc.parallelize(data1)
    rdd2 = sc.parallelize(data2)
    rdd3 = rdd1.union(rdd2)
    print(rdd3.collect())  # [1, 2, 3, 4, 5, 6]
Full code:
from pyspark import SparkConf, SparkContext

def getsc():
    # Create SparkConf and configure Spark
    conf = SparkConf().setMaster("local[2]").setAppName("sparktest")
    # Create SparkContext
    sc = SparkContext(conf=conf)
    return sc

def my_union():
    sc = getsc()
    data1 = [1, 2, 3]
    data2 = [4, 5, 6]
    rdd1 = sc.parallelize(data1)
    rdd2 = sc.parallelize(data2)
    rdd3 = rdd1.union(rdd2)
    print(rdd3.collect())  # [1, 2, 3, 4, 5, 6]

if __name__ == '__main__':
    my_union()
8. distinct
def my_distinct():
    sc = getsc()
    data = [11, 2, 2, 3, 5, 5]
    rdd = sc.parallelize(data).distinct()
    print(rdd.collect())  # [2, 3, 11, 5]
    sc.stop()
Full code:
from pyspark import SparkConf, SparkContext

def getsc():
    # Create SparkConf and configure Spark
    conf = SparkConf().setMaster("local[2]").setAppName("sparktest")
    # Create SparkContext
    sc = SparkContext(conf=conf)
    return sc

def my_distinct():
    sc = getsc()
    data = [11, 2, 2, 3, 5, 5]
    rdd = sc.parallelize(data).distinct()
    print(rdd.collect())  # [2, 3, 11, 5]

if __name__ == '__main__':
    my_distinct()
9. join
As with relational databases, this includes inner joins, left outer joins, right outer joins, and full outer joins.
def my_join():
    sc = getsc()
    data1 = [('a', 1), ('b', 2), ('c', 3)]
    data2 = [('a', 4), ('d', 2), ('e', 5)]
    rdd1 = sc.parallelize(data1)
    rdd2 = sc.parallelize(data2)
    print(rdd1.join(rdd2).collect())
    # inner join: [('a', (1, 4))]
    print(rdd1.leftOuterJoin(rdd2).collect())
    # left join:  [('b', (2, None)), ('c', (3, None)), ('a', (1, 4))]
    print(rdd1.rightOuterJoin(rdd2).collect())
    # right join: [('a', (1, 4)), ('e', (None, 5)), ('d', (None, 2))]
    print(rdd1.fullOuterJoin(rdd2).collect())
    # full join:  [('b', (2, None)), ('c', (3, None)), ('a', (1, 4)), ('e', (None, 5)), ('d', (None, 2))]
Full code:
from pyspark import SparkConf, SparkContext

def getsc():
    # Create SparkConf and configure Spark
    conf = SparkConf().setMaster("local[2]").setAppName("sparktest")
    # Create SparkContext
    sc = SparkContext(conf=conf)
    return sc

def my_join():
    sc = getsc()
    data1 = [('a', 1), ('b', 2), ('c', 3)]
    data2 = [('a', 4), ('d', 2), ('e', 5)]
    rdd1 = sc.parallelize(data1)
    rdd2 = sc.parallelize(data2)
    print(rdd1.join(rdd2).collect())
    # inner join: [('a', (1, 4))]
    print(rdd1.leftOuterJoin(rdd2).collect())
    # left join:  [('b', (2, None)), ('c', (3, None)), ('a', (1, 4))]
    print(rdd1.rightOuterJoin(rdd2).collect())
    # right join: [('a', (1, 4)), ('e', (None, 5)), ('d', (None, 2))]
    print(rdd1.fullOuterJoin(rdd2).collect())
    # full join:  [('b', (2, None)), ('c', (3, None)), ('a', (1, 4)), ('e', (None, 5)), ('d', (None, 2))]

if __name__ == '__main__':
    my_join()
(2) Actions
1. collect
Returns all elements of the dataset to the driver as a list:
def my_collect():
    data = [1, 2, 3, 4, 5]
    sc = getsc()
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.map(lambda x: x + 1)
    print(type(rdd2.collect()))  # <class 'list'>
    print(rdd2.collect())        # [2, 3, 4, 5, 6]
2. count
Returns the number of elements:
def my_count():
    data = [1, 2, 3, 4, 5]
    sc = getsc()
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.map(lambda x: x + 1)
    print(rdd2.count())  # 5
3. take
Returns the specified number of elements:
def my_take():
    data = [1, 2, 3, 4, 5]
    sc = getsc()
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.map(lambda x: x + 1)
    print(rdd2.take(2))  # [2, 3]
4. reduce(func)
Aggregates the elements of the dataset cumulatively with func:
def my_reduce():
    data = [1, 2, 3, 4, 5]
    sc = getsc()
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.map(lambda x: x + 1)
    print(rdd2.reduce(lambda a, b: a * b))  # 720
5. foreach
def my_foreach():
    data = [1, 2, 3, 4, 5]
    sc = getsc()
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.map(lambda x: x + 1)
    rdd2.foreach(lambda x: print(x))  # prints 2, 3, 4, 5, 6, one per line (order may vary)
    sc.stop()
In addition, there are many more actions, such as sum, max, and so on; a few of them are sketched below.
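A quick sketch of a few of these built-in actions, reusing rdd2 (the [2, 3, 4, 5, 6] RDD) from the examples above:

print(rdd2.sum())    # 20  -- sum of all elements
print(rdd2.max())    # 6   -- largest element
print(rdd2.min())    # 2   -- smallest element
print(rdd2.first())  # 2   -- first element
print(rdd2.top(2))   # [6, 5]  -- the 2 largest elements, in descending order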
II. Operators in Practice
(1) WordCount
1. Test file
The file I:\spark_project\data\wordcount.txt contains:
Hello Boy
Hello Girl Hello Santy
2. Configure parameters
Pass the file path above to the program as a command-line argument (set in the IDE's run configuration; screenshot omitted).
3. Write the program
- Use flatMap to split each line of the text into individual words
- Use map to turn each word into a (word, 1) tuple
- Use reduceByKey to aggregate the counts
from pyspark import SparkConf, SparkContext
import sys

def getsc():
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    return sc

def printCount():
    sc = getsc()
    counts = sc.textFile(sys.argv[1]).flatMap(lambda line: line.split(" ")).\
        map(lambda x: (x, 1)).\
        reduceByKey(lambda a, b: a + b)
    output = counts.collect()
    for (word, count) in output:
        print(word, count)
    sc.stop()

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: wordcount <input>", file=sys.stderr)
        sys.exit(-1)
    printCount()
The result is:
Boy 1
Hello 3
Santy 1
Girl 1
4. Testing in the Spark environment
Upload the .py file and the txt file above to /root/hadoopdata on the Linux machine, then go to Spark's bin directory and run:
[root@hadoop-master bin]# ./spark-submit --master local[2] --name wordcount /root/hadoopdata/WordCount.py file:////root/hadoopdata/wordcount.txt
The output log includes:
20/04/06 12:51:15 INFO Utils: Copying /root/hadoopdata/WordCount.py to /tmp/spark-d2f30a0f-24d2-4593-99fb-6e1308033ea1/userFiles-57519adb-7896-488b-a868-d8886277c7d0/WordCount.py
...
20/04/06 12:51:17 INFO FileInputFormat: Total input paths to process : 1
...
20/04/06 12:51:20 INFO DAGScheduler: Job 0 finished: collect at /root/hadoopdata/WordCount.py:13, took 2.360603 s
Boy 1
Hello 3
Santy 1
Girl 1
20/04/06 12:51:20 INFO SparkUI: Stopped Spark web UI at http://192.168.0.110:4041
The result is printed correctly.
That was reading a single file. What about reading multiple files, for example a directory /root/hadoopdata/wc containing several txt files?
[root@hadoop-master wc]# ls
wordcount1.txt  wordcount.txt
Run the command:
[root@hadoop-master bin]# ./spark-submit --master local[2] --name wordcount /root/hadoopdata/WordCount.py file:////root/hadoopdata/wc
This still works and reads every file in the directory:
...
20/04/06 13:01:33 INFO DAGScheduler: Job 0 finished: collect at /root/hadoopdata/WordCount.py:13, took 2.088327 s
Boy 2
Hello 6
Santy 2
Girl 2
20/04/06 13:01:33 INFO SparkUI: Stopped Spark web UI at http://192.168.0.110:4041
..
Glob patterns are also supported, for example to read only files with a .csv extension:
[root@hadoop-master bin]# ./spark-submit --master local[2] --name wordcount /root/hadoopdata/WordCount.py file:////root/hadoopdata/wc/*.csv
5. Writing to a file
Above, the result was printed to the console, but often it needs to be written to a file instead:
from pyspark import SparkConf, SparkContext
import sys

def getsc():
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    return sc

def saveFile():
    sc = getsc()
    counts = sc.textFile(sys.argv[1]).flatMap(lambda line: line.split(" ")).\
        map(lambda x: (x, 1)).\
        reduceByKey(lambda a, b: a + b)
    counts.saveAsTextFile(sys.argv[2])  # write the result to files
    sc.stop()

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage: wordcount <input> <output>", file=sys.stderr)
        sys.exit(-1)
    saveFile()
This requires one more program argument (again set in the IDE's run configuration; screenshot omitted).
Accordingly, when running in the Spark environment, an extra argument is needed for the output directory:
[root@hadoop-master bin]# ./spark-submit --master local[2] --name wordcount /root/hadoopdata/WordCount.py file:////root/hadoopdata/wc file:////root/hadoopdata/output
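Note that saveAsTextFile writes a directory containing one part file per partition rather than a single file. As a sketch for inspecting the result (run, for example, from a pyspark shell, and assuming the output path used above):

# Each line of the part files is the string form of a (word, count) tuple.
print(sc.textFile("file:///root/hadoopdata/output").collect())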
(2) TopN
1. Test file
http://127.0.0.1 123456 pets click
http://127.0.0.1 1923756 sports click
http://127.0.0.1 123956 pets click
http://127.0.0.1 123156 pets click
http://127.0.0.1 1234056 pets click
http://127.0.0.1 123416 books click
http://127.0.0.1 123456 pets click
http://127.0.0.1 123456 pets click
http://127.0.0.1 123456 pets click
http://127.0.0.1 1234056 pets click
http://127.0.0.1 1234056 pets click
2. Analysis
Each line of the file contains a URL, a user ID, a user interest, and a user action. Find the top 3 users by number of visits.
- Use map to extract the field of interest, the user ID, from each line
- Use map to form (userID, 1) pairs
- Use reduceByKey to compute the per-user totals
- Use sortByKey to sort, then take the top 3
3. Implementation
Detailed, step-by-step version:
from pyspark import SparkConf, SparkContext
import sys

def getsc():
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    return sc

def topN():
    sc = getsc()
    fileContent = sc.textFile(sys.argv[1])
    # print(fileContent.collect())  # reads the file into a list of lines
    """
    ['http://127.0.0.1 123456 pets click', 'http://127.0.0.1 1923756 sports click',
     'http://127.0.0.1 123956 pets click', 'http://127.0.0.1 123156 pets click',
     'http://127.0.0.1 1234056 pets click', 'http://127.0.0.1 123416 books click',
     'http://127.0.0.1 123456 pets click', 'http://127.0.0.1 123456 pets click',
     'http://127.0.0.1 123456 pets click', 'http://127.0.0.1 1234056 pets click',
     'http://127.0.0.1 1234056 pets click']
    """
    lineContent = fileContent.map(lambda line: line.split(" "))
    # print(lineContent.collect())
    """
    [['http://127.0.0.1', '123456', 'pets', 'click'],
     ['http://127.0.0.1', '1923756', 'sports', 'click'],
     ['http://127.0.0.1', '123956', 'pets', 'click'],
     ['http://127.0.0.1', '123156', 'pets', 'click'],
     ['http://127.0.0.1', '1234056', 'pets', 'click'],
     ['http://127.0.0.1', '123416', 'books', 'click'],
     ['http://127.0.0.1', '123456', 'pets', 'click'],
     ['http://127.0.0.1', '123456', 'pets', 'click'],
     ['http://127.0.0.1', '123456', 'pets', 'click'],
     ['http://127.0.0.1', '1234056', 'pets', 'click'],
     ['http://127.0.0.1', '1234056', 'pets', 'click']]
    """
    userIdRdd = lineContent.map(lambda x: x[1])
    # print(userIdRdd.collect())
    """
    ['123456', '1923756', '123956', '123156', '1234056', '123416',
     '123456', '123456', '123456', '1234056', '1234056']
    """
    groupByKeyRdd = userIdRdd.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
    # print(groupByKeyRdd.collect())
    """
    [('1923756', 1), ('123956', 1), ('123416', 1), ('123156', 1), ('123456', 4), ('1234056', 3)]
    """
    sortByKeyRdd = groupByKeyRdd.map(lambda x: (x[1], x[0])).sortByKey(False).map(lambda x: (x[1], x[0]))
    print(sortByKeyRdd.collect())
    """
    [('123456', 4), ('1234056', 3), ('1923756', 1), ('123956', 1), ('123416', 1), ('123156', 1)]
    """
    # take the top 3
    Top3 = sortByKeyRdd.take(3)  # [('123456', 4), ('1234056', 3), ('1923756', 1)]
    print(Top3)
    sc.stop()

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: topN <input>", file=sys.stderr)
        sys.exit(-1)
    topN()
(3) Computing an Average
1. Test file
1 17
2 36
3 60
4 49
5 14
6 51
7 27
8 36
9 45
10 27
2. Analysis
This is a file of age records; compute the average of all the ages.
- Use map to extract the age field
- Use reduce to sum all the ages
- Use count to get the number of records
- Compute the average age
3. Implementation
from pyspark import SparkConf, SparkContext
import sys

def getsc():
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    return sc

def avgAge():
    sc = getsc()
    ageData = sc.textFile(sys.argv[1]).map(lambda x: x.split(" ")[1])
    print(ageData.collect())  # ['17', '36', '60', '49', '14', '51', '27', '36', '45', '27']
    totalAge = ageData.\
        map(lambda x: int(x)).\
        reduce(lambda a, b: a + b)
    numAge = ageData.count()
    avgAge = totalAge / numAge
    print(totalAge)  # 362
    print(numAge)    # 10
    print(avgAge)    # 36.2
    sc.stop()

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: avgAge <input>", file=sys.stderr)
        sys.exit(-1)
    avgAge()
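As a shorter alternative sketch, the RDD API also offers a built-in mean action for numeric RDDs, which would replace the reduce/count/divide steps above:

# mean() computes the average of the numeric elements directly.
avg = ageData.map(lambda x: int(x)).mean()
print(avg)  # 36.2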

