实例9 英文词频统计

实例

# Normalize English text noise (letter case, punctuation) for word counting.
def get_txt(path="D:\\hamlet.txt", encoding=None):
    """Read a text file and return it lower-cased with punctuation blanked out.

    Args:
        path: File to read. Defaults to the originally hard-coded location,
            so existing ``get_txt()`` calls keep working.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default.

    Returns:
        The file contents in lower case, with every character from the
        punctuation list replaced by a single space.
    """
    # The apostrophe is not in the list, so contractions such as "don't"
    # survive as one word. The typographic quote is kept from the original
    # list; the ASCII backtick (which it appears to have stood in for) is
    # added so that backticks are blanked as well.
    punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_‘`{|}~'
    table = str.maketrans(punctuation, " " * len(punctuation))
    # The context manager closes the file even if read() raises.
    with open(path, "r", encoding=encoding) as f:
        txt = f.read()
    # One translate() pass replaces the chain of per-character replace() calls.
    return txt.lower().translate(table)


# Count word frequencies and print the most frequent words.
t = get_txt()
dic_counts = {}  # maps each word to its number of occurrences
for word in t.split():
    dic_counts[word] = dic_counts.get(word, 0) + 1
items = list(dic_counts.items())
# Order the (word, count) pairs by count, highest first.
items.sort(key=lambda x: x[1], reverse=True)
# Slicing (rather than indexing 0..9) avoids an IndexError when the text
# contains fewer than 10 distinct words. split() never yields empty or
# whitespace-only tokens, so no blank-word filter is needed here.
for word, nums in items[:10]:
    print("{0:<10}{1:>5}".format(word, nums))

 

posted @ 2021-08-06 13:03  seaidler  阅读(1)  评论(0)    收藏  举报