python123:三国演义词频分析
文本获取:ThreeKingdoms.txt(三国演义.txt):https://python123.io/resources/pye/threekingdoms.txt
因为文本是复制粘贴到 txt 文档中的,文件并非以 UTF-8 编码保存(可能是系统默认的 ANSI/GBK 编码),所以第一次运行时提示编码错误:'utf-8' codec can't decode byte 0xc8 in position 0: invalid continuation byte。
将文本打开后另存为 UTF-8 编码格式即可。

# CalThreeKingdomSv1.py
# Print the 15 most frequent multi-character words found in threekingdoms.txt.
from collections import Counter

import jieba

# Read the whole text; the context manager guarantees the file handle is closed.
with open("threekingdoms.txt", "r", encoding="UTF-8") as f:
    txt = f.read()

words = jieba.lcut(txt)  # segment the text into a list of words

# Tally every token except single-character ones.
counts = Counter(word for word in words if len(word) != 1)

# (word, count) pairs ordered by descending count; ties keep first-seen order.
items = counts.most_common()

# Slicing yields fewer rows instead of raising IndexError when the text
# contains fewer than 15 distinct words.
for word, count in items[:15]:
    print("{0:<10}{1:>5}".format(word, count))
根据上面的输出结果继续优化代码:把同一人物的不同称呼合并统计,并排除不是人名的高频词:
# CalThreeKingdomSv2.py
# Print the 10 most frequent words in threekingdoms.txt after merging alternate
# names of the same character and dropping a fixed set of excluded words.
from collections import Counter

import jieba

# Read the whole text; the context manager guarantees the file handle is closed.
with open("threekingdoms.txt", "r", encoding="UTF-8") as f:
    txt = f.read()

# Words removed from the tally before ranking.
excludes = {"将军", "却说", "荆州", "二人", "不可", "不能", "如此"}

# Alternate spellings/titles folded into one canonical key per character.
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}

words = jieba.lcut(txt)  # segment the text into a list of words

# Skip single-character tokens; count every other token under its canonical name.
counts = Counter(aliases.get(word, word) for word in words if len(word) != 1)

# pop() with a default tolerates excluded words that never occur in the text,
# where `del counts[word]` would raise KeyError.
for word in excludes:
    counts.pop(word, None)

# (word, count) pairs ordered by descending count; ties keep first-seen order.
items = counts.most_common()

# Slicing yields fewer rows instead of raising IndexError when fewer than
# 10 distinct words remain.
for word, count in items[:10]:
    print("{0:<10}{1:>5}".format(word, count))

浙公网安备 33010602011771号