# 点击查看代码 ("Click to view code" - a widget label left over from the web page, not part of the program)
import jieba
from collections import Counter
# Character alias table (extend as needed). Each canonical character name is
# listed once, together with the alternate names that should count as it.
_aliases_by_character = (
    ("孙悟空", ("孙猴子", "齐天大圣", "美猴王", "弼马温")),
    ("唐三藏", ("唐僧", "玄奘", "御弟")),
    ("猪八戒", ("八戒", "呆子")),
    ("沙和尚", ("沙僧", "悟净")),
)

# Flattened alias -> canonical-name lookup used when merging word counts.
name_mapping = {
    alias: canonical
    for canonical, aliases in _aliases_by_character
    for alias in aliases
}
def process_text(file_path, top_n=20):
    """Return the most frequent words of a text file, with character aliases merged.

    The file is read as UTF-8 and segmented with jieba. Every token that is a
    key of ``name_mapping`` is replaced by its canonical character name,
    tokens shorter than two characters are discarded, and the remaining
    tokens are counted.

    Note: the names in ``name_mapping`` are registered with jieba via
    ``jieba.add_word``, which modifies jieba's in-memory dictionary for the
    rest of the process.

    Args:
        file_path: Path of the UTF-8 encoded text file to analyse.
        top_n: Number of most frequent words to return. Defaults to 20;
            ``None`` returns every counted word.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent.

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # The alias lookup below only fires when a name comes back from jieba as
    # a single token. Registering every alias and canonical name as a
    # dictionary word lets jieba emit it whole instead of splitting a name it
    # does not already know into smaller pieces.
    for name in (*name_mapping, *name_mapping.values()):
        jieba.add_word(name)

    # Fold aliases into their canonical name first, then apply the length
    # filter to the resulting word, and count what is left.
    unified_words = (name_mapping.get(token, token) for token in jieba.lcut(text))
    word_counts = Counter(word for word in unified_words if len(word) >= 2)

    return word_counts.most_common(top_n)
# Example usage. The guard keeps the file read and the printing from running
# when this module is imported instead of being executed as a script.
if __name__ == "__main__":
    file_path = '西游记.txt'  # replace with the actual path to the text file
    top_words = process_text(file_path)
    # Print the ranking, numbered from 1.
    print("《西游记》高频词Top20(已合并人物别称):")
    for i, (word, count) in enumerate(top_words, 1):
        print(f"{i}. {word}: {count}次")