# 08 分布式计算 MapReduce -- 词频统计 (Distributed computing with MapReduce: word-frequency count)
import re
import collections
def count_word(path, encoding="utf-8"):
    """Count how often each word occurs in a text file.

    The whole file is read, lower-cased, and the characters ``" , . ! ?``
    are replaced with spaces before the text is split on whitespace.

    Args:
        path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale.

    Returns:
        A dict mapping each word to its number of occurrences, with keys
        in order of first appearance in the text.
    """
    with open(path, encoding=encoding) as file_process:
        texts = file_process.read()
    # Lower-case so that "The" and "the" are counted as the same word.
    texts = texts.lower()
    # Replace the handled punctuation with spaces. A raw-string character
    # class avoids the invalid "\." / "\?" escapes of a plain string literal.
    texts = re.sub(r'[",.!?]', " ", texts)
    # Counter keeps first-seen order; convert back to a plain dict for callers.
    return dict(collections.Counter(texts.split()))
def sort_by_count(d):
    """Return the entries of *d* ordered from highest to lowest count.

    Args:
        d: Mapping of word to count.

    Returns:
        A ``collections.OrderedDict`` holding the same items as *d*, sorted
        by value in descending order. Items with equal counts keep the
        relative order they had in *d* (the sort is stable).
    """
    ranked = sorted(d.items(), key=lambda item: item[1], reverse=True)
    return collections.OrderedDict(ranked)
# Input file to analyse.
file_name = "./english.txt"
dword = count_word(file_name)
dword = sort_by_count(dword)
# Print one "word:count" line per word, most frequent first.
for key, value in dword.items():
    print(key, value, sep=":")

浙公网安备 33010602011771号