理解MapReduce计算架构
2018-05-11 21:57 216-陈文建 阅读(156) 评论(0) 收藏 举报

用Python编写WordCount程序任务
| 程序 | WordCount |
| --- | --- |
| 输入 | 一个包含大量单词的文本文件 |
| 输出 | 文件中每个单词及其出现次数(频数),并按照单词字母顺序排序,每个单词和其频数占一行,单词和频数之间有间隔 |
- 编写map函数,reduce函数
- 将其权限作出相应修改
- 本机上测试运行代码
- 放到HDFS上运行
- 下载并上传文件到hdfs上
- 用Hadoop Streaming命令提交任务
```
create 'Student',' S_No ','S_Name','S_Sex','S_Age'
put 'Student','s001','S_No','2015001'
put 'Student','s001','S_Name','Zhangsan'
put 'Student','s001','S_Sex','male'
put 'Student','s001','S_Age','23'
put 'Student','s002','S_No','2015003'
put 'Student','s002','S_Name','Mary'
put 'Student','s002','S_Sex','female'
put 'Student','s002','S_Age','22'
put 'Student','s003','S_No','2015003'
put 'Student','s003','S_Name','Lisi'
put 'Student','s003','S_Sex','male'
put 'Student','s003','S_Age','24'
```

```
scan 'Student'
alter 'Student','NAME'=>'course'
put 'Student','3','course:Math','85'
dorp 'Student','course'
count 's1'
truncate 's1'
```

```
cd /home/hadoop/wc
sudo gedit mapper.py

# map函数
import sys
for i in stdin:
    i=i.strip()
    words=i.split()
    for word in words:
        print '%s\t%s'%(word,1)

#reduce函数
from operator import itemgetter
import sys
current_word=None
current_count=0
word=None
for i in stdin:
    i=i.strip()
    word, count=i.split('\t',1)
    try:
        count=int(count)
    except ValueError:
        continue
    if current_word==word:
        current_count+=count
    else:
        if current_word:
            print '%s\t%s'%(current_word, current_count)
        current_count=count
        current_word=word
if current_word==word:
    print '%s\t%s'%(current_word, current_count)
```

```
chmod a+x /home/hadoop/mapper.py
```

```
echo "foo foo quux labs foo bar quux" | /home/hadoop/wc/mapper.py
echo "foo foo quux labs foo bar quux" | /home/hadoop/wc/mapper.py | sort -k1,1 | /home/hadoop/wc/reducer.py
```

```
cd /home/hadoop/wc
wget http://www.gutenberg.org/files/5000/5000-8.txt
wget http://www.gutenberg.org/cache/epub/20417/pg20417.txt
cd /usr/hadoop/wc
hdfs dfs -put /home/hadoop/hadoop/gutenberg/*.txt /user/hadoop/input
```
浙公网安备 33010602011771号