08 分布式计算MapReduce--词频统计

import re

import collections

def count_word(path):
    """Count how often each word occurs in the text file at ``path``.

    The text is lower-cased, the punctuation characters ``"``, ``,``, ``.``,
    ``!`` and ``?`` are replaced by spaces, and the result is split on
    whitespace to obtain the words.

    Args:
        path: Path of the text file to read; it is decoded as UTF-8.

    Returns:
        A plain ``dict`` mapping each word to its number of occurrences,
        with keys in order of first appearance in the text.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, encoding="utf-8") as file_process:
        texts = file_process.read().lower()

    # Turn punctuation into spaces so it acts as a word separator. A raw
    # string with a character class needs no backslash escapes at all.
    texts = re.sub(r'[",.!?]', " ", texts)

    # Counter does the "initialise to 0, then increment" bookkeeping; convert
    # back to a plain dict so callers keep receiving an ordinary dictionary.
    return dict(collections.Counter(texts.split()))

def sort_by_count(d):
    """Return the entries of ``d`` ordered from highest to lowest count.

    Args:
        d: Mapping from word to its numeric count.

    Returns:
        A ``collections.OrderedDict`` holding the same key/value pairs,
        sorted by descending value. Entries with equal counts keep the
        relative order they had in ``d``.
    """
    def negated_count(pair):
        # Sorting ascending on the negated count yields descending counts.
        return -pair[1]

    ranked = list(d.items())
    ranked.sort(key=negated_count)
    return collections.OrderedDict(ranked)

# Input file to analyse.
file_name = "./english.txt"

# Count the words, then order them from most to least frequent.
dword = sort_by_count(count_word(file_name))

# Print one "word:count" line per entry.
for word, count in dword.items():
    print(f"{word}:{count}")

posted @ 2021-11-23 13:03 邹昀昊阅读(98) 评论(0) 收藏举报

刷新页面返回顶部

邹昀昊