Python 词频统计：分析《三国演义》中出现次数最多的词

from collections import Counter

import jieba
 2 """分析三国演义小说中名字出现次数最多的人物然后输出"""
 3 #第一步 读取小说内容
 4 fb=open('三国演义.txt','r',encoding='utf-8')# 'r'表示操作read
 5 content=fb.read()
 6 fb.close()
 7 excludes = {"将军", "却说", "荆州", "二人", "不可", "不能", "如此", "商议", "如何", "主公",
 8 
 9             "军士", "左右", "军马", "引兵", "次日", "大喜", "天下", "东吴", "于是", "今日",
10 
11             "不敢", "魏兵","人马", "陛下", "一人", "不知", "汉中", "只见", "众将","蜀兵","丞相"}  #排除
12 print('--------------------------------------------------------------------')
13 #第二步 分词
14 words=jieba._lcut(content)
15 #3.统计 容器 数据{单词:次数}
16 data={}#定义空字典
17 for word in words:
18     if(len(word)==1):#去除符号和单字
19         continue
20     elif word=='孔明曰'or word=='诸葛亮':
21         rename='孔明'
22     elif word=='玄德曰'or word=='玄德':
23         rename='刘备'
24     elif word=='云长'or word=='关公':
25         rename='关羽'
26     else:
27         rename=word
28     data[rename] = data.get(rename, 0) + 1
29 
30 #2.去除干扰词汇
31 for word in excludes:
32    del(data[word])
33 
34 
35 #排序
36 list=list(data.items())#转成列表以便排序
37 list.sort(key=lambda x:x[1],reverse=True)
38 fo=open('result.txt','w',encoding='utf-8')
39 
40 for i in range(10):#返回序列
41     print('{:<10}{:>5}'.format(list[i][0],list[i][1]))#输出到控制台
42     fo.write('{:<10}{:>5}{}'.format(list[i][0],list[i][1],'\n'))#写入到文件
43 fo.close()

 

运行结果:

 

 

posted on 2020-02-11 20:44  somethingα  阅读(2967)  评论(0)  编辑  收藏  举报