[Review] Practicing WordCount with various Hadoop components

MapReduce

Map:

# map.py: emit one "word\t1" record for every word read from stdin
import sys

for line in sys.stdin:
    words = line.strip().split(' ')
    for word in words:
        print "\t".join([word.strip(), '1'])

Reduce:

# reduce.py: stdin arrives sorted by word, so counts for the same word are adjacent
import sys

sum, tmp = 0, None          # running count, and the word currently being accumulated
for line in sys.stdin:
    words = line.strip().split('\t')
    if len(words) != 2:
        continue
    word, cnt = words
    if not tmp:
        tmp = word
    if tmp != word:
        # key changed: flush the finished word's total, then reset for the new word
        print '\t'.join([tmp, str(sum)])
        tmp = word
        sum = 0
    sum += int(cnt)
print '\t'.join([tmp, str(sum)])   # flush the last word
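
Before submitting to the cluster, the two scripts can be sanity-checked locally by simulating the shuffle with sort (assuming a local copy of The_Man_of_Property.txt in the current directory):

cat The_Man_of_Property.txt | python map.py | sort -k1,1 | python reduce.py | head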

Submit via Hadoop Streaming:

HADOOP_CMD="/usr/local/src/hadoop-2.6.5/bin/hadoop"
STREAM_JAR_PATH="/usr/local/src/hadoop-2.6.5/share/hadoop/tools/lib/hadoop-streaming-2.6.5.jar"

INPUT_FILE_PATH="/The_Man_of_Property.txt"
OUTPUT_PATH="/wordCount/mapreduce"

# clear any previous output; the job fails if the output directory already exists
$HADOOP_CMD fs -rmr -skipTrash $OUTPUT_PATH

$HADOOP_CMD jar $STREAM_JAR_PATH \
    -input $INPUT_FILE_PATH \
    -output $OUTPUT_PATH \
    -mapper "python map.py" \
    -reducer "python reduce.py" \
    -file map.py \
    -file reduce.py
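
Assuming the script above is saved as run.sh (the file name is only an assumption) next to map.py and reduce.py, the job is launched and its output listed with:

bash run.sh
hadoop fs -ls /wordCount/mapreduce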

Verify:

[root@master wordCount]# hadoop fs -text /wordCount/mapreduce/part-00000 | head
(Baynes 1
(Dartie 1
(Dartie’s       1
(Down-by-the-starn)     2
(Down-by-the-starn),    1
(He     1
(I      1
(James) 1
(L500)  1
(Louisa 1
text: Unable to write to output stream.

Spark

PySpark code:

from __future__ import print_function
import sys
from pyspark import SparkContext, SparkConf

# Python 2: make utf-8 the default encoding so the text decodes cleanly
reload(sys)
sys.setdefaultencoding('utf-8')

def f(x):
    # split one line into words on single spaces
    return x.strip().split(' ')

if __name__ == "__main__":
    conf = SparkConf().setMaster("local").setAppName("spark_wordcount")
    sc = SparkContext(conf=conf)

    in_file = sc.textFile("/The_Man_of_Property.txt")

    # Alternative: collect the (word, count) pairs to the driver and print them
    # resultRdd = in_file.flatMap(f).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
    # for line in resultRdd.collect():
    #     print(line)

    # flatMap -> map -> reduceByKey, format as "word\tcount", write to HDFS
    in_file\
        .flatMap(f)\
        .map(lambda word: (word, 1))\
        .reduceByKey(lambda a, b: a + b)\
        .map(lambda line: '\t'.join([line[0], str(line[1])]))\
        .saveAsTextFile("/wordCount/spark")

    sc.stop()
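
Assuming the code above is saved as spark_wordcount.py (the file name is only an assumption), it can be run with spark-submit:

spark-submit --master local spark_wordcount.py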

Verify:

[root@master wordCount]# hadoop fs -text /wordCount/spark/part-00000 | head
—‘the   1
writings        1
considered,     1
shouted,        1
yellow  7
‘Humbug!’       1
four    30
aegis   1
considered;     1
text: Unable to write to output stream.

Hive

Load the document:

create table wordcount_docs(line string);
LOAD DATA INPATH '/The_Man_of_Property.txt' OVERWRITE INTO TABLE wordcount_docs;
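
Note that LOAD DATA INPATH (without LOCAL) moves the HDFS file into the table's warehouse directory rather than copying it. A quick sanity check that the lines were loaded could look like this:

hive> select count(*) from wordcount_docs;
hive> select * from wordcount_docs limit 2;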

Method 1 (explode + group by):

create table words(word string);

 

insert into table words select explode(split(line, " ")) as word from wordcount_docs;

Splitting the words: explode is one of Hive's built-in table-generating functions (UDTFs); it expands a single column into multiple rows (see the standalone example after this list):

  • explode(ARRAY): generates one row per element of the array
  • explode(MAP): generates one row per key-value pair of the map, with the key in one column and the value in another
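
As a standalone illustration of the ARRAY form, explode over the array produced by split turns each line of the corpus into one row per word, which is exactly what the insert above relies on:

hive> select explode(split(line, ' ')) as word from wordcount_docs limit 5;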

Verify:

hive> select * from words limit 2;
OK
Preface
“The
Time taken: 0.051 seconds, Fetched: 2 row(s)

split is Hive's string-splitting function; its return value is an array.

hive> describe function split;
split(str, regex) - Splits str around occurances that match regex
Time taken: 0.769 seconds, Fetched: 1 row(s)
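
A quick illustration of the array return type (assuming a Hive version that allows SELECT without FROM, 0.13 or later):

hive> select split('hello world hadoop', ' ');
["hello","world","hadoop"]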

Verify the word count:

hive> select word, count(*) as cnt from words group by word order by cnt desc limit 2;
Total MapReduce CPU Time Spent: 9 seconds 470 msec
OK
the     5144
of      3407
Time taken: 73.257 seconds, Fetched: 10 row(s)

Method 2 (transform):

create table wordCount(word string,cnt int) row format delimited fields terminated by '\t';

 

hive> desc wordCount;
OK
word                    string                                      
cnt                     int                                         
Time taken: 0.078 seconds, Fetched: 2 row(s)

 

add file /home/master/test/wordCount/hive/map.py;

Verify the map step:

select transform(line) using "python map.py" as word,cnt from wordcount_docs limit 10;

 

add file /home/master/test/wordCount/hive/reduce.py;

Between map and reduce the data needs to be partitioned and sorted by key, hence the cluster by in the query below.

Verify the reduce step:

select transform (M.word , M.cnt) using "python reduce.py" as w,c 
from 
(select transform(line) using "python map.py" as word,cnt 
from wordcount_docs cluster by word) M 
limit 10;

cluster by (shorthand for distribute by plus sort by on the same column) sorts in ascending order by default.
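
The same shuffle can be spelled out explicitly; this sketch should be equivalent to the cluster by form used above:

select transform(line) using "python map.py" as word, cnt
from wordcount_docs
distribute by word
sort by word;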

hive> insert overwrite table wordCount
    > select transform (M.word , M.cnt) using "python reduce.py" as w,c 
    > from
    > (select transform(line) using "python map.py" as word,cnt 
    > from wordcount_docs cluster by word) M ;

Verify:

hive> select * from wordCount order by cnt desc limit 3;

Stage-Stage-1: Map: 1  Reduce: 1   Cumulative CPU: 4.6 sec   HDFS Read: 187579 HDFS Write: 84 SUCCESS
Total MapReduce CPU Time Spent: 4 seconds 600 msec
OK
the     5144
of      3407
to      2782
Time taken: 32.815 seconds, Fetched: 10 row(s)