
Learning Spark Operators

I. Operators in Detail

Spark supports two types of operations: transformations and actions. A transformation creates a new dataset from an existing one, while an action runs a computation on the dataset and returns a value.

(1) Transformations

  All transformations are lazy: calling a transformation operator does not compute anything immediately; Spark only records the operation. The recorded transformations are executed only when an action is triggered.
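
A minimal sketch of this laziness (illustrative only; the printed RDD repr will vary by version):

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[2]").setAppName("lazy-demo")
sc = SparkContext(conf=conf)

rdd = sc.parallelize(range(10))
mapped = rdd.map(lambda x: x + 1)  # transformation: returns immediately, nothing is computed yet
print(mapped)                      # just an RDD object, something like PythonRDD[1] at ...
print(mapped.collect())            # action: the job actually runs here -> [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
sc.stop()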

1. map(func)

Applies func to each element of the dataset and returns a new distributed dataset.

def my_map():
    data = [1,2,3,4,5]
    sc = getsc()
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.map(lambda x:x+1)
    print(type(rdd2.collect())) #<class 'list'>
    print(rdd2.collect())  #[2, 3, 4, 5, 6]
    sc.stop()

Full code:

from pyspark import SparkConf,SparkContext


def getsc():
    # Create a SparkConf and configure Spark
    conf = SparkConf().setMaster("local[2]").setAppName("sparktest")

    # Create the SparkContext
    sc = SparkContext(conf=conf)
    return sc


def my_map():
    data = [1,2,3,4,5]
    sc = getsc()
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.map(lambda x:x+1)
    print(type(rdd2.collect())) #<class 'list'>
    print(rdd2.collect())  #[2, 3, 4, 5, 6]
    sc.stop()


if __name__ == '__main__':
    my_map()

2. filter(func)

Keeps the elements for which func returns True and returns a new distributed dataset.

def my_filter():
    data = [1,2,3,4,5]
    sc = getsc()
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.filter(lambda x:x>3) 
    print(type(rdd2.collect())) #<class 'list'>
    print(rdd2.collect())  #[4, 5]
    sc.stop()

Full code:

from pyspark import SparkConf,SparkContext


def getsc():
    # Create a SparkConf and configure Spark
    conf = SparkConf().setMaster("local[2]").setAppName("sparktest")

    # Create the SparkContext
    sc = SparkContext(conf=conf)
    return sc


def my_filter():
    data = [1,2,3,4,5]
    sc = getsc()
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.filter(lambda x:x>3)
    print(type(rdd2.collect())) #<class 'list'>
    print(rdd2.collect())  #[4, 5]
    sc.stop()


if __name__ == '__main__':
    my_filter()

3. flatMap(func)

Similar to map, but each input item can be mapped to 0 or more output items, so func should return a sequence.

def my_flatMap():
    sc = getsc()
    data = ["Hello Word","Hello Boy","Hello Girl"]
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.flatMap(lambda x:x.split(" "))
    print(rdd2.collect()) #['Hello', 'Word', 'Hello', 'Boy', 'Hello', 'Girl']

Full code:

from pyspark import SparkConf,SparkContext


def getsc():
    # Create a SparkConf and configure Spark
    conf = SparkConf().setMaster("local[2]").setAppName("sparktest")

    # Create the SparkContext
    sc = SparkContext(conf=conf)
    return sc


def my_flatMap():
    sc = getsc()
    data = ["Hello Word","Hello Boy","Hello Girl"]
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.flatMap(lambda x:x.split(" "))
    print(rdd2.collect()) #['Hello', 'Word', 'Hello', 'Boy', 'Hello', 'Girl']
    sc.stop()


if __name__ == '__main__':
    my_flatMap()

4. groupByKey

Brings the values that share the same key together.

def my_groupByKey():
    sc = getsc()
    data = ["Hello Word","Hello Boy","Hello Girl"]
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.flatMap(lambda x: x.split(" ")) ##.collect()  ['Hello', 'Word', 'Hello', 'Boy', 'Hello', 'Girl']
    rdd3 = rdd2.map(lambda x:(x,1)) #.collect()   [('Hello', 1), ('Word', 1), ('Hello', 1), ('Boy', 1), ('Hello', 1), ('Girl', 1)]
    groupByRdd = rdd3.groupByKey() ##.collect() [('Hello', <pyspark.resultiterable.ResultIterable object at 0x0000000003168160>),..]
    rdd4 = groupByRdd.map(lambda x:{x[0]:list(x[1])}).collect() #[{'Hello': [1, 1, 1]}, {'Boy': [1]}, {'Word': [1]}, {'Girl': [1]}]

Full code:

from pyspark import SparkConf,SparkContext


def getsc():
    # Create a SparkConf and configure Spark
    conf = SparkConf().setMaster("local[2]").setAppName("sparktest")

    # Create the SparkContext
    sc = SparkContext(conf=conf)
    return sc


def my_groupByKey():
    sc = getsc()
    data = ["Hello Word","Hello Boy","Hello Girl"]
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.flatMap(lambda x: x.split(" ")) ##.collect()  ['Hello', 'Word', 'Hello', 'Boy', 'Hello', 'Girl']
    rdd3 = rdd2.map(lambda x:(x,1)) #.collect()   [('Hello', 1), ('Word', 1), ('Hello', 1), ('Boy', 1), ('Hello', 1), ('Girl', 1)]
    groupByRdd = rdd3.groupByKey() ##.collect() [('Hello', <pyspark.resultiterable.ResultIterable object at 0x0000000003168160>),..]
    rdd4 = groupByRdd.map(lambda x:{x[0]:list(x[1])}).collect() #[{'Hello': [1, 1, 1]}, {'Boy': [1]}, {'Word': [1]}, {'Girl': [1]}]
    print(rdd4)
    sc.stop()


if __name__ == '__main__':
    my_groupByKey()

groupByKey takes an RDD of key-value tuples and groups them by key, collecting all values that share a key into an iterable (converted to a list above).
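
A slightly more idiomatic way to materialize the grouped values is mapValues (a small sketch reusing rdd3 from the example above):

rdd4 = rdd3.groupByKey().mapValues(list).collect()
# [('Hello', [1, 1, 1]), ('Boy', [1]), ('Word', [1]), ('Girl', [1])]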

5. reduceByKey

Brings the values that share the same key together and reduces them with the given function.

def my_reduceByKey():
    sc = getsc()
    data = ["Hello Word", "Hello Boy", "Hello Girl"]
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.flatMap(lambda x: x.split(" "))  ##.collect() ['Hello', 'Word', 'Hello', 'Boy', 'Hello', 'Girl']
    rdd3 = rdd2.map(lambda x:(x,1)) #.collect() [('Hello', 1), ('Word', 1), ('Hello', 1), ('Boy', 1), ('Hello', 1), ('Girl', 1)]
    rdd4 = rdd3.reduceByKey(lambda a,b:a+b).collect() 
    print(rdd4) #[('Hello', 3), ('Boy', 1), ('Word', 1), ('Girl', 1)]

Full code:

from pyspark import SparkConf,SparkContext


def getsc():
    # Create a SparkConf and configure Spark
    conf = SparkConf().setMaster("local[2]").setAppName("sparktest")

    # Create the SparkContext
    sc = SparkContext(conf=conf)
    return sc

def my_reduceByKey():
    sc = getsc()
    data = ["Hello Word", "Hello Boy", "Hello Girl"]
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.flatMap(lambda x: x.split(" "))  ##.collect() ['Hello', 'Word', 'Hello', 'Boy', 'Hello', 'Girl']
    rdd3 = rdd2.map(lambda x:(x,1)) #.collect() [('Hello', 1), ('Word', 1), ('Hello', 1), ('Boy', 1), ('Hello', 1), ('Girl', 1)]
    rdd4 = rdd3.reduceByKey(lambda a,b:a+b).collect()
    print(rdd4) #[('Hello', 3), ('Boy', 1), ('Word', 1), ('Girl', 1)]

if __name__ == '__main__':
    my_reduceByKey()

This operator goes one step further than groupByKey: the list of values sharing each key has already been summed.
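
For this word count the two are therefore roughly interchangeable (a sketch reusing rdd3; reduceByKey is usually preferred because it combines values within each partition before shuffling):

wc1 = rdd3.groupByKey().mapValues(sum).collect()      # group, then sum each key's values
wc2 = rdd3.reduceByKey(lambda a, b: a + b).collect()  # same result, less data shuffled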

6. sortByKey

Suppose we now want to sort the result above by count. sortByKey sorts by key, i.e. by the word itself, which is clearly not what we want, so the pairs must be transformed first.

def my_sortByKey():
    sc = getsc()
    data = ["Hello Word", "Hello Boy", "Hello Girl"]
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.flatMap(lambda x: x.split(" "))  ##.collect() ['Hello', 'Word', 'Hello', 'Boy', 'Hello', 'Girl']
    rdd3 = rdd2.map(lambda x:(x,1)) #.collect() [('Hello', 1), ('Word', 1), ('Hello', 1), ('Boy', 1), ('Hello', 1), ('Girl', 1)]
    rdd4 = rdd3.reduceByKey(lambda a,b:a+b) # .collect() #[('Hello', 3), ('Boy', 1), ('Word', 1), ('Girl', 1)]
    rdd5 = rdd4.map(lambda x:(x[1],x[0])) # .collect() [(3, 'Hello'), (1, 'Boy'), (1, 'Word'), (1, 'Girl')]
    rdd6 = rdd5.sortByKey().collect()
    print(rdd6)  #[(1, 'Boy'), (1, 'Word'), (1, 'Girl'), (3, 'Hello')]

Full code:

from pyspark import SparkConf,SparkContext


def getsc():
    # Create a SparkConf and configure Spark
    conf = SparkConf().setMaster("local[2]").setAppName("sparktest")

    # Create the SparkContext
    sc = SparkContext(conf=conf)
    return sc


def my_sortByKey():
    sc = getsc()
    data = ["Hello Word", "Hello Boy", "Hello Girl"]
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.flatMap(lambda x: x.split(" "))  ##.collect() ['Hello', 'Word', 'Hello', 'Boy', 'Hello', 'Girl']
    rdd3 = rdd2.map(lambda x:(x,1)) #.collect() [('Hello', 1), ('Word', 1), ('Hello', 1), ('Boy', 1), ('Hello', 1), ('Girl', 1)]
    rdd4 = rdd3.reduceByKey(lambda a,b:a+b) # .collect() #[('Hello', 3), ('Boy', 1), ('Word', 1), ('Girl', 1)]
    rdd5 = rdd4.map(lambda x:(x[1],x[0])) # .collect() [(3, 'Hello'), (1, 'Boy'), (1, 'Word'), (1, 'Girl')]
    rdd6 = rdd5.sortByKey().collect()
    print(rdd6)  #[(1, 'Boy'), (1, 'Word'), (1, 'Girl'), (3, 'Hello')]

if __name__ == '__main__':
    my_sortByKey()

Transform the final result with another map if needed. The default order is ascending; pass a Boolean to sortByKey to choose ascending or descending.
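
For example, to sort by count in descending order and restore the (word, count) shape (a sketch continuing from rdd5 above; ties may come out in any order):

rdd6 = rdd5.sortByKey(False).map(lambda x: (x[1], x[0])).collect()
# [('Hello', 3), ('Boy', 1), ('Word', 1), ('Girl', 1)]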

7. union

Returns a new dataset containing the union of the elements of the two source datasets.

def my_union():
    sc = getsc()
    data1 = [1,2,3]
    data2 = [4,5,6]
    rdd1 = sc.parallelize(data1)
    rdd2 = sc.parallelize(data2)
    rdd3 = rdd1.union(rdd2)
    print(rdd3.collect()) #[1, 2, 3, 4, 5, 6]

Full code:

from pyspark import SparkConf,SparkContext


def getsc():
    # Create a SparkConf and configure Spark
    conf = SparkConf().setMaster("local[2]").setAppName("sparktest")

    # Create the SparkContext
    sc = SparkContext(conf=conf)
    return sc


def my_union():
    sc = getsc()
    data1 = [1,2,3]
    data2 = [4,5,6]
    rdd1 = sc.parallelize(data1)
    rdd2 = sc.parallelize(data2)
    rdd3 = rdd1.union(rdd2)
    print(rdd3.collect()) #[1, 2, 3, 4, 5, 6]


if __name__ == '__main__':
    my_union()

8. distinct

Returns a new dataset containing the distinct elements of the source dataset.

def my_distinct():
    sc = getsc()
    data = [11,2,2,3,5,5]
    rdd = sc.parallelize(data).distinct()
    print(rdd.collect()) #[2, 3, 11, 5]
    sc.stop()

Full code:

from pyspark import SparkConf,SparkContext


def getsc():
    # Create a SparkConf and configure Spark
    conf = SparkConf().setMaster("local[2]").setAppName("sparktest")

    # Create the SparkContext
    sc = SparkContext(conf=conf)
    return sc


def my_distinct():
    sc = getsc()
    data = [11,2,2,3,5,5]
    rdd = sc.parallelize(data).distinct()
    print(rdd.collect()) #[2, 3, 11, 5]


if __name__ == '__main__':
    my_distinct()
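
Note that the output order is scrambled: distinct involves a shuffle, roughly like the following sketch, so elements come back in shuffle order rather than input order:

def my_distinct_by_hand(rdd):
    # approximately what distinct does under the hood
    return rdd.map(lambda x: (x, None)) \
              .reduceByKey(lambda a, b: a) \
              .map(lambda x: x[0])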

9. join

As with relational databases, this includes inner join, left outer join, right outer join, and full outer join.

def my_join():
    sc = getsc()
    data1 = [('a',1),('b',2),('c',3)]
    data2 = [('a',4),('d',2),('e',5)]
    rdd1 = sc.parallelize(data1)
    rdd2 = sc.parallelize(data2)
    print(rdd1.join(rdd2).collect()) #inner  [('a', (1, 4))]
    print(rdd1.leftOuterJoin(rdd2).collect()) #left join  [('b', (2, None)), ('c', (3, None)), ('a', (1, 4))]
    print(rdd1.rightOuterJoin(rdd2).collect()) #right join  [('a', (1, 4)), ('e', (None, 5)), ('d', (None, 2))]
    print(rdd1.fullOuterJoin(rdd2).collect()) #full join  [('b', (2, None)), ('c', (3, None)), ('a', (1, 4)), ('e', (None, 5)), ('d', (None, 2))]

Full code:

from pyspark import SparkConf,SparkContext


def getsc():
    # Create a SparkConf and configure Spark
    conf = SparkConf().setMaster("local[2]").setAppName("sparktest")

    # Create the SparkContext
    sc = SparkContext(conf=conf)
    return sc


def my_join():
    sc = getsc()
    data1 = [('a',1),('b',2),('c',3)]
    data2 = [('a',4),('d',2),('e',5)]
    rdd1 = sc.parallelize(data1)
    rdd2 = sc.parallelize(data2)
    print(rdd1.join(rdd2).collect()) #inner  [('a', (1, 4))]
    print(rdd1.leftOuterJoin(rdd2).collect()) #left join  [('b', (2, None)), ('c', (3, None)), ('a', (1, 4))]
    print(rdd1.rightOuterJoin(rdd2).collect()) #right join  [('a', (1, 4)), ('e', (None, 5)), ('d', (None, 2))]
    print(rdd1.fullOuterJoin(rdd2).collect()) #full join  [('b', (2, None)), ('c', (3, None)), ('a', (1, 4)), ('e', (None, 5)), ('d', (None, 2))]


if __name__ == '__main__':
    my_join()

(2) Actions

1. collect

Returns the elements of the dataset to the driver as a list:

def my_collect():
    data = [1,2,3,4,5]
    sc = getsc()
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.map(lambda x:x+1)
    print(type(rdd2.collect())) #<class 'list'>
    print(rdd2.collect())  #[2, 3, 4, 5, 6]

2. count

Returns the number of elements:

def my_count():
    data = [1,2,3,4,5]
    sc = getsc()
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.map(lambda x:x+1)
    print(rdd2.count()) #5

3. take

Returns the first n elements:

def my_take():
    data = [1,2,3,4,5]
    sc = getsc()
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.map(lambda x:x+1)
    print(rdd2.take(2)) #[2, 3]

4. reduce(func)

Aggregates the elements of the dataset with func:

def my_reduce():
    data = [1,2,3,4,5]
    sc = getsc()
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.map(lambda x:x+1)
    print(rdd2.reduce(lambda a,b:a*b)) #720

5. foreach

Applies a function to every element, usually for side effects. Note that the function runs on the executors, so on a cluster the printed output appears in the executors' stdout rather than the driver's.

def my_foreach():
    data = [1,2,3,4,5]
    sc = getsc()
    rdd1 = sc.parallelize(data)
    rdd2 = rdd1.map(lambda x:x+1)
    rdd2.foreach(lambda x:print(x))  # prints 2,3,4,5,6 (order not guaranteed)
    sc.stop()

There are many more actions, such as sum, max, and so on.
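
A few of them on a small RDD (a quick sketch; all are standard RDD actions):

rdd = getsc().parallelize([1, 2, 3, 4, 5])
print(rdd.sum())    # 15
print(rdd.max())    # 5
print(rdd.min())    # 1
print(rdd.first())  # 1
print(rdd.top(2))   # [5, 4] -- the two largest elements, in descending order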

II. Operators in Practice

(1) WordCount

1. Test file

The file I:\spark_project\data\wordcount.txt contains:

Hello Boy
Hello Girl Hello Santy

2. Configure parameters

Pass the file path above to the script as a command-line argument (e.g. via the IDE's run configuration):

3. Write the program

  • Use flatMap to split each line of the text into individual words
  • Use map to turn each word into a (word, 1) tuple
  • Use reduceByKey to merge and count them
from pyspark import SparkConf,SparkContext
import sys
def getsc():
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    return sc

def printCount():
    sc = getsc()
    counts = sc.textFile(sys.argv[1]).flatMap(lambda line:line.split(" ")).\
        map(lambda x:(x,1)).\
        reduceByKey(lambda a,b:a+b)
    output = counts.collect()
    for (word,count) in output:
        print(word,count)
    sc.stop()

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage:wordcount <input>",file=sys.stderr)
sys.exit(-1) printCount()

The output:

Boy 1
Hello 3
Santy 1
Girl 1

4. Testing in the Spark environment

Upload the .py file and the .txt file above to /root/hadoopdata on the Linux machine, then go to Spark's bin directory and run:

[root@hadoop-master bin]# ./spark-submit --master local[2] --name wordcount /root/hadoopdata/WordCount.py file:///root/hadoopdata/wordcount.txt

The log output includes:

20/04/06 12:51:15 INFO Utils: Copying /root/hadoopdata/WordCount.py to /tmp/spark-d2f30a0f-24d2-4593-99fb-6e1308033ea1/userFiles-57519adb-7896-488b-a868-d8886277c7d0/WordCount.py

...

20/04/06 12:51:17 INFO FileInputFormat: Total input paths to process : 1

...

20/04/06 12:51:20 INFO DAGScheduler: Job 0 finished: collect at /root/hadoopdata/WordCount.py:13, took 2.360603 s
Boy 1
Hello 3
Santy 1
Girl 1
20/04/06 12:51:20 INFO SparkUI: Stopped Spark web UI at http://192.168.0.110:4041

The job runs and produces the result.

That was a single file. What about reading multiple files, say several txt files under /root/hadoopdata/wc?

[root@hadoop-master wc]# ls
wordcount1.txt  wordcount.txt

Run the command:

[root@hadoop-master bin]# ./spark-submit --master local[2] --name wordcount /root/hadoopdata/WordCount.py file:///root/hadoopdata/wc

This still works; Spark reads every file in the directory:

...

20/04/06 13:01:33 INFO DAGScheduler: Job 0 finished: collect at /root/hadoopdata/WordCount.py:13, took 2.088327 s
Boy 2
Hello 6
Santy 2
Girl 2
20/04/06 13:01:33 INFO SparkUI: Stopped Spark web UI at http://192.168.0.110:4041

...

Patterns are also supported, e.g. only files with a .csv extension:

[root@hadoop-master bin]# ./spark-submit --master local[2] --name wordcount /root/hadoopdata/WordCount.py file:///root/hadoopdata/wc/*.csv
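
More generally, textFile accepts directories, glob patterns, and comma-separated lists of paths (a sketch; the paths in the last line are hypothetical):

sc.textFile("file:///root/hadoopdata/wc")          # a whole directory
sc.textFile("file:///root/hadoopdata/wc/*.txt")    # a glob pattern
sc.textFile("file:///a/one.txt,file:///b/two.txt") # a comma-separated list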

5. Writing to a file

So far the results have been printed; often they need to be written to a file instead:

from pyspark import SparkConf,SparkContext
import sys
def getsc():
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    return sc

    
def saveFile():
    sc = getsc()
    counts = sc.textFile(sys.argv[1]).flatMap(lambda line:line.split(" ")).\
        map(lambda x:(x,1)).\
        reduceByKey(lambda a,b:a+b)
    counts.saveAsTextFile(sys.argv[2])  # write the results out as text files
    sc.stop()

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage:wordcount <input>",file=sys.stderr)
        sys.exit(-1)
    saveFile()

This needs one more parameter in the run configuration. Likewise, when running in the Spark environment an extra output-directory argument is required. Note that the output directory must not already exist, or saveAsTextFile will raise an error; the results are written as part-* files inside it:

[root@hadoop-master bin]# ./spark-submit --master local[2] --name wordcount /root/hadoopdata/WordCount.py file:///root/hadoopdata/wc file:///root/hadoopdata/output

(2) TopN

1. Test file (TopN.txt)

http://127.0.0.1 123456 pets click
http://127.0.0.1 1923756 sports click
http://127.0.0.1 123956 pets click
http://127.0.0.1 123156 pets click
http://127.0.0.1 1234056 pets click
http://127.0.0.1 123416 books click
http://127.0.0.1 123456 pets click
http://127.0.0.1 123456 pets click
http://127.0.0.1 123456 pets click
http://127.0.0.1 1234056 pets click
http://127.0.0.1 1234056 pets click

2. Analysis

Each line of the file contains an address, a user ID, a user interest, and a user action. The goal is to find the Top 3 users by number of visits.

  • Use map to extract the field of interest, the UserID, from each line
  • Use map to form (UserID, 1) pairs
  • Use reduceByKey to compute the final counts
  • Use sortByKey to sort, then take the Top 3

3. Implementation

Step-by-step breakdown:

from pyspark import SparkConf,SparkContext
import sys
def getsc():
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    return sc

def topN():
    sc = getsc()
    fileContent = sc.textFile(sys.argv[1])
    # print(fileContent)
    # print(fileContent.collect())  # the file's lines as a list
    """
    ['http://127.0.0.1 123456 pets click', 
    'http://127.0.0.1 1923756 sports click',
     'http://127.0.0.1 123956 pets click',
      'http://127.0.0.1 123156 pets click', 
    'http://127.0.0.1 1234056 pets click',
     'http://127.0.0.1 123416 books click', 
    'http://127.0.0.1 123456 pets click', 
    'http://127.0.0.1 123456 pets click', 
    'http://127.0.0.1 123456 pets click',
     'http://127.0.0.1 1234056 pets click',
     'http://127.0.0.1 1234056 pets click'
     ]
    """
    lineContent = fileContent.map(lambda line:line.split(" "))
    # print(lineContent.collect())
    """
    [
    ['http://127.0.0.1', '123456', 'pets', 'click'], 
    ['http://127.0.0.1', '1923756', 'sports', 'click'], 
    ['http://127.0.0.1', '123956', 'pets', 'click'], 
    ['http://127.0.0.1', '123156', 'pets', 'click'],
     ['http://127.0.0.1', '1234056', 'pets', 'click'],
      ['http://127.0.0.1', '123416', 'books', 'click'], 
      ['http://127.0.0.1', '123456', 'pets', 'click'], 
      ['http://127.0.0.1', '123456', 'pets', 'click'],
       ['http://127.0.0.1', '123456', 'pets', 'click'],
        ['http://127.0.0.1', '1234056', 'pets', 'click'], 
        ['http://127.0.0.1', '1234056', 'pets', 'click']]
    """
    userIdRdd = lineContent.map(lambda x:x[1])
    # print(userIdRdd.collect())
    """
    [
    '123456', '1923756', '123956', '123156', '1234056',
     '123416', '123456', '123456', '123456', '1234056', '1234056'
     ]
    """
    reduceByKeyRdd = userIdRdd.map(lambda x:(x,1)).reduceByKey(lambda a,b:a+b)
    # print(reduceByKeyRdd.collect())
    """
    [('1923756', 1), ('123956', 1), ('123416', 1), ('123156', 1), ('123456', 4), ('1234056', 3)]
    """
    sortByKeyRdd = reduceByKeyRdd.map(lambda x:(x[1],x[0])).sortByKey(False).map(lambda x:(x[1],x[0]))
    print(sortByKeyRdd.collect())
    """
    [('123456', 4), ('1234056', 3), ('1923756', 1), ('123956', 1), ('123416', 1), ('123156', 1)]
    """
    # take the top 3
    Top3 = sortByKeyRdd.take(3) #[('123456', 4), ('1234056', 3), ('1923756', 1)]
    print(Top3)
    sc.stop()


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage:wordcount <input>",file=sys.stderr)
        sys.exit(-1)
    topN()
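
Instead of the swap-sort-swap sequence, the Top 3 can be obtained in one step with takeOrdered on the (UserID, count) RDD produced by reduceByKey, using the negated count as the sort key (a sketch; ties among equal counts may come out in any order):

Top3 = reduceByKeyRdd.takeOrdered(3, key=lambda x: -x[1])
# e.g. [('123456', 4), ('1234056', 3), ('1923756', 1)]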

(3) Computing the Average

1. Test file

1 17
2 36
3 60
4 49
5 14
6 51
7 27
8 36
9 45
10 27

2. Analysis

This is a file of age records; the goal is to compute the average of all the ages.

  • Use map to extract the field of interest, the age
  • Use reduce to compute the sum of all ages
  • Use count to get the number of records
  • Compute the average age

3. Implementation

from pyspark import SparkConf,SparkContext
import sys
def getsc():
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    return sc

def avgAge():
    sc = getsc()
    ageData =  sc.textFile(sys.argv[1]).map(lambda x:x.split(" ")[1]) 
    print(ageData.collect()) ##['17', '36', '60', '49', '14', '51', '27', '36', '45', '27']
    totalAge =ageData.\
        map(lambda x:int(x)).\
        reduce(lambda a,b:a+b)
    numAge = ageData.count()
    avgAge = totalAge/numAge
    print(totalAge)  # 362
    print(numAge)    # 10
    print(avgAge)    # 36.2
    sc.stop()

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage:wordcount <input>",file=sys.stderr)
        sys.exit(-1)
    avgAge()
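
As a side note, the sum and the count can also be computed in a single pass with aggregate, instead of running two separate jobs over the data (a sketch using the standard RDD.aggregate API):

def avg_one_pass(ages):
    # ages: an RDD of ints; accumulate (sum, count) in one pass
    total, num = ages.aggregate(
        (0, 0),
        lambda acc, x: (acc[0] + x, acc[1] + 1),  # fold an element into a partition's accumulator
        lambda a, b: (a[0] + b[0], a[1] + b[1]))  # merge two partitions' accumulators
    return total / num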

 
