# 08 分布式计算MapReduce--词频统计

import re
import collections

def count_word(path):
    """Count how often each word occurs in the text file at *path*.

    The text is lower-cased, the punctuation characters ``" , . ! ?`` are
    each replaced by a space, and the result is split on whitespace.

    Args:
        path: Location of the text file. It is opened in text mode with the
            platform's default encoding.

    Returns:
        A plain ``dict`` mapping each word to its number of occurrences,
        keyed in the order the words first appear in the file. An empty
        file yields an empty dict.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path) as file_process:
        texts = file_process.read()

    # Normalise case so that "Hello" and "hello" count as the same word.
    texts = texts.lower()
    # Raw-string character class: matches the same five punctuation marks as
    # before, without relying on invalid string escapes such as "\." or "\?".
    texts = re.sub(r'[",.!?]', " ", texts)

    # Counter keeps first-seen order; convert back to a plain dict so the
    # return type stays what callers already receive.
    return dict(collections.Counter(texts.split()))


def sort_by_count(d):
    """Return the entries of *d* ordered from highest to lowest count.

    Args:
        d: Mapping of word to its numeric count.

    Returns:
        A ``collections.OrderedDict`` holding the same key/value pairs,
        sorted by descending value. Entries with equal values keep the
        relative order they had in *d*.
    """

    def negated_count(pair):
        # Negating the count makes an ascending sort produce descending
        # counts while the stable sort keeps ties in their original order.
        return -pair[1]

    pairs = list(d.items())
    pairs.sort(key=negated_count)
    return collections.OrderedDict(pairs)

# Input file whose words are counted.
file_name = "./english.txt"

# Count the words, then order them from most to least frequent.
dword = sort_by_count(count_word(file_name))

# Print one "word:count" line per entry.
for key, value in dword.items():
    print(f"{key}:{value}")
# posted @ 2021-11-23 13:03  邹昀昊  阅读(98)  评论(0)    收藏  举报