python123:三国演义词频分析

文本获取:ThreeKingdoms.txt(三国演义.txt):https://python123.io/resources/pye/threekingdoms.txt

因为文本是复制粘贴到txt文档中保存的(保存时未使用UTF-8编码),第一次运行时提示编码错误:'utf-8' codec can't decode byte 0xc8 in position 0: invalid continuation byte。

将文本打开另存为utf-8编码格式即可

# CalThreeKingdomSv1.py
# Word-frequency analysis of threekingdoms.txt (first version): segment the
# text with jieba, count every multi-character word and print the 15 most
# frequent ones.


def count_words(words):
    """Return a dict mapping each word to its number of occurrences.

    Tokens that are exactly one character long are skipped.

    Args:
        words: iterable of string tokens (e.g. the list from jieba.lcut).

    Returns:
        dict of word -> occurrence count.
    """
    counts = {}
    for word in words:
        if len(word) == 1:
            continue
        # First occurrence starts from 0, later ones increment the tally.
        counts[word] = counts.get(word, 0) + 1
    return counts


def top_words(counts, n):
    """Return up to n (word, count) pairs ordered by descending count.

    Slicing (instead of indexing n times) keeps this safe when the text
    yields fewer than n distinct words.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return ranked[:n]


def main():
    """Read the novel, count its words and print the 15 most frequent."""
    # Imported here so the counting helpers above can be used (and tested)
    # on machines where the third-party jieba package is not installed.
    import jieba

    # The context manager closes the file even if reading fails.
    with open("threekingdoms.txt", "r", encoding="UTF-8") as f:
        txt = f.read()
    counts = count_words(jieba.lcut(txt))
    for word, count in top_words(counts, 15):
        print("{0:<10}{1:>5}".format(word, count))


if __name__ == "__main__":
    main()

  根据结果继续优化代码:把同一人物的不同称呼合并为一个名字,并排除不是人名的高频词:

# CalThreeKingdomSv1.py (refined version)
# Character-frequency analysis of threekingdoms.txt: different ways of
# addressing the same character are merged into one name, frequent words that
# are not character names are dropped, and the 10 most frequent names are
# printed.

# Frequent words that are removed from the final tally.
excludes = {"将军", "却说", "荆州", "二人", "不可", "不能", "如此"}

# Alias -> name under which the occurrence is counted.
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}


def count_names(words):
    """Return a dict of name -> occurrence count for the given tokens.

    Tokens that are exactly one character long are skipped. Every other
    token is first translated through ``aliases`` so that all aliases of a
    character are tallied under a single name; tokens without an alias are
    counted as themselves. Entries listed in ``excludes`` are removed at the
    end.

    Args:
        words: iterable of string tokens (e.g. the list from jieba.lcut).

    Returns:
        dict of name -> occurrence count.
    """
    counts = {}
    for word in words:
        if len(word) == 1:
            continue
        name = aliases.get(word, word)
        # The tally must be updated for aliased words as well; doing it only
        # for unaliased words would silently drop every merged occurrence.
        counts[name] = counts.get(name, 0) + 1
    for word in excludes:
        # pop() with a default tolerates excluded words that never occurred.
        counts.pop(word, None)
    return counts


def top_names(counts, n):
    """Return up to n (name, count) pairs ordered by descending count.

    Slicing (instead of indexing n times) keeps this safe when fewer than n
    distinct names were counted.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return ranked[:n]


def main():
    """Read the novel, count character names and print the 10 most frequent."""
    # Imported here so the counting helpers above can be used (and tested)
    # on machines where the third-party jieba package is not installed.
    import jieba

    # The context manager closes the file even if reading fails.
    with open("threekingdoms.txt", "r", encoding="UTF-8") as f:
        txt = f.read()
    counts = count_names(jieba.lcut(txt))
    for word, count in top_names(counts, 10):
        print("{0:<10}{1:>5}".format(word, count))


if __name__ == "__main__":
    main()

  

 

posted @ 2021-11-22 15:42  吉祥知知  阅读(683)  评论(0)    收藏  举报