# Word-frequency analysis of the Liaozhai text, merging character aliases
# into canonical names before counting.
import jieba
from collections import Counter

# Map each character's canonical name to the aliases used in the text
name_mapping = {"宁采臣": ["宁生", "采臣"], "聂小倩": ["小倩", "聂女"]}
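
# Optional: register the names and aliases with jieba's dictionary so the
# tokenizer keeps them as single tokens (assumes the default dictionary
# does not already cover them).
for formal_name, aliases in name_mapping.items():
    jieba.add_word(formal_name)
    for alias in aliases:
        jieba.add_word(alias)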
# 1. Read the Liaozhai text (replace "liaozhai.txt" with the actual file path)
with open("liaozhai.txt", "r", encoding="utf-8") as f:
    text = f.read()
# 2. Tokenize the text with jieba
words = jieba.lcut(text)
# 3. Merge character aliases into canonical names
merged_words = []
for word in words:
    for formal_name, aliases in name_mapping.items():
        if word in aliases:
            merged_words.append(formal_name)
            break
    else:
        # for-else: no alias matched, so keep the original token
        merged_words.append(word)
# 4. Drop stopwords and single-character tokens (extend the stopword list as needed)
stopwords = ["的", "了", "是", "在", "和", "也", "呀"]
filtered_words = [word for word in merged_words if word not in stopwords and len(word) > 1]
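
# Optional sketch: if a fuller stopword file is available (hypothetical
# "stopwords.txt", one word per line), use it instead of the short inline
# list above and redo the filtering.
import os
if os.path.exists("stopwords.txt"):
    with open("stopwords.txt", "r", encoding="utf-8") as f:
        stopwords = {line.strip() for line in f if line.strip()}
    filtered_words = [word for word in merged_words if word not in stopwords and len(word) > 1]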
# 5. Count word frequencies and take the 20 most common
word_count = Counter(filtered_words)
top20 = word_count.most_common(20)
# 6. Print the results
for word, count in top20:
    print(f"{word}: {count}")