# "View code" -- stray page-UI label left over from copy/paste; not part of the program.
import jieba
from collections import Counter
# Word-frequency report: segment the novel text with jieba, fold each
# character's aliases into one canonical name, and print the 20 most
# frequent multi-character words.

# Read the whole file up front; the context manager guarantees the handle
# is closed even if reading fails.
with open("C:\\Users\\黄楚玉\\Desktop\\杂七杂八\\hongloumeng.txt", "r", encoding='utf-8') as novel_file:
    text = novel_file.read()

# Canonical character name -> every spelling that should count toward it.
personName = {
    '宝玉': ['宝玉', '贾宝玉', '宝二爷'],
    '黛玉': ['黛玉', '林黛玉', '林妹妹', '颦儿'],
    '宝钗': ['宝钗', '薛宝钗', '宝姐姐'],
    '王熙凤': ['王熙凤', '凤姐', '凤辣子'],
    '贾母': ['贾母', '老太太'],
}

# Invert the table into alias -> canonical name for O(1) lookup per token.
replace_dict = {
    alias: main_name
    for main_name, aliases in personName.items()
    for alias in aliases
}

# NOTE(review): an alias is only merged when jieba emits it as a single
# token; if some aliases get split, registering them with jieba.add_word
# before cutting may help -- confirm against the actual segmentation.
words = jieba.lcut(text)

# Every entry here is a single character, so the len(word) > 1 filter below
# already excludes them; the list only takes effect once multi-character
# stop words are added.
stop_words = ['的', '是', '了', '和', '我', '你', '他', '这', '那', '着', '也', '都', '要', '就', '不', '人', '有', '说', '道', '笑']
# Set copy so the per-token membership test does not scan the list.
stop_word_set = set(stop_words)

# Keep multi-character, non-stop-word, non-whitespace tokens and map any
# alias to its canonical name.
filtered_words = [
    replace_dict.get(word, word)
    for word in words
    if len(word) > 1 and word not in stop_word_set and not word.isspace()
]

word_counts = Counter(filtered_words)  # count word frequencies
top20 = word_counts.most_common(20)

print("红楼梦词频统计(前20):")
for word, count in top20:
    print(f"{word},{count}次")