python123:三国演义词频分析
文本获取:ThreeKingdoms.txt(三国演义.txt):https://python123.io/resources/pye/threekingdoms.txt
因为文本是手动复制到txt文档中的,保存时的编码很可能不是UTF-8(例如系统默认的ANSI/GBK),而代码按UTF-8读取,所以第一次运行时提示编码错误:'utf-8' codec can't decode byte 0xc8 in position 0: invalid continuation byte。
将文本打开并另存为utf-8编码格式即可解决。
# CalThreeKingdomSv1.py
"""Print the 15 most frequent multi-character words in threekingdoms.txt."""
from collections import Counter


def count_words(words):
    """Count occurrences of each word, skipping single-character tokens.

    Args:
        words: Iterable of word strings, e.g. the list returned by
            ``jieba.lcut``.

    Returns:
        A ``Counter`` mapping each word whose length is not exactly 1 to the
        number of times it appears in ``words``.
    """
    return Counter(word for word in words if len(word) != 1)


def top_items(counts, limit):
    """Return up to ``limit`` ``(word, count)`` pairs, highest count first.

    Args:
        counts: Mapping of word -> occurrence count.
        limit: Maximum number of pairs to return.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count. Words
        with equal counts keep their order from ``counts``. The list is
        shorter than ``limit`` when ``counts`` has fewer entries.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    # Slicing never raises, unlike indexing items[i] for a fixed range.
    return ranked[:limit]


def main():
    """Read the novel, segment it with jieba and print the top 15 words."""
    # Imported here so the counting helpers above can be imported and tested
    # without the third-party segmenter being installed.
    import jieba

    # The context manager closes the file even if reading fails.
    with open("threekingdoms.txt", "r", encoding="UTF-8") as source:
        txt = source.read()

    words = jieba.lcut(txt)  # word segmentation
    for word, count in top_items(count_words(words), 15):
        print("{0:<10}{1:>5}".format(word, count))


if __name__ == "__main__":
    main()
根据结果继续优化代码:把同一人物的不同称呼合并统计,并排除不是人名的高频词:
# CalThreeKingdomSv1.py
"""Print the 10 most frequent names in threekingdoms.txt.

Refines the plain word count by merging several spellings of the same
person into one name and by dropping a fixed set of excluded words.
"""
from collections import Counter

# Frequent words that should not appear in the ranking.
EXCLUDES = {"将军", "却说", "荆州", "二人", "不可", "不能", "如此"}

# Alternative tokens (left) counted under a single name (right).
ALIASES = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}


def count_names(words, excludes=EXCLUDES):
    """Count words after alias merging, without the excluded words.

    Args:
        words: Iterable of word strings, e.g. the list returned by
            ``jieba.lcut``.
        excludes: Words to remove from the result after counting.

    Returns:
        A ``Counter`` mapping each remaining word to its occurrence count.
        Tokens whose length is exactly 1 are skipped. Tokens listed in
        ``ALIASES`` are counted under their mapped name.
    """
    counts = Counter(
        ALIASES.get(word, word) for word in words if len(word) != 1
    )
    for word in excludes:
        # pop with a default: an excluded word that never occurs in the text
        # must not raise KeyError the way ``del counts[word]`` does.
        counts.pop(word, None)
    return counts


def top_items(counts, limit):
    """Return up to ``limit`` ``(word, count)`` pairs, highest count first.

    Args:
        counts: Mapping of word -> occurrence count.
        limit: Maximum number of pairs to return.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count. Words
        with equal counts keep their order from ``counts``. The list is
        shorter than ``limit`` when ``counts`` has fewer entries.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    # Slicing never raises, unlike indexing items[i] for a fixed range.
    return ranked[:limit]


def main():
    """Read the novel, segment it with jieba and print the top 10 names."""
    # Imported here so the counting helpers above can be imported and tested
    # without the third-party segmenter being installed.
    import jieba

    # The context manager closes the file even if reading fails.
    with open("threekingdoms.txt", "r", encoding="UTF-8") as source:
        txt = source.read()

    words = jieba.lcut(txt)  # word segmentation
    for word, count in top_items(count_names(words), 10):
        print("{0:<10}{1:>5}".format(word, count))


if __name__ == "__main__":
    main()