from collections import Counter

import jieba
# Read the Liaozhai text. Adjust the path/filename as needed.
# NOTE: the original used encoding="utf - 8", which only works because
# CPython normalizes codec names; "utf-8" is the correct, portable form.
with open("liaozhai.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Aliases mapping alternate references to a canonical character name;
# extend this dict as more variants are identified in the text.
name_mapping = {
    "婴宁娘子": "婴宁",
    "聂小倩姑娘": "聂小倩"
}

# Segment the text into words with jieba.
words = jieba.lcut(text)

# Count word frequencies, normalizing aliases to their canonical name first.
# Counter replaces the manual if/else tally loop.
word_count = Counter(name_mapping.get(word, word) for word in words)

# Drop single-character tokens (mostly particles/punctuation; tune as needed).
filtered_word_count = {k: v for k, v in word_count.items() if len(k) > 1}

# Top 20 words by descending frequency (most_common sorts for us).
top_20_words = Counter(filtered_word_count).most_common(20)

# Print the result.
for word, count in top_20_words:
    print(f"词语:{word},出现次数:{count}")