使用 jieba 对《西游记》全文分词,统计出现次数最高的 20 个词(人物别称合并为统一名称)

点击查看代码
import jieba
from collections import Counter

# Alias -> unified-name lookup for the main characters (extend as needed).
# Each entry pairs one unified name with the aliases that are folded into it;
# the comprehension flattens these groups into a flat alias -> name dict.
name_mapping = {
    alias: unified_name
    for unified_name, aliases in (
        ("孙悟空", ("孙猴子", "齐天大圣", "美猴王", "弼马温")),
        ("唐三藏", ("唐僧", "玄奘", "御弟")),
        ("猪八戒", ("八戒", "呆子")),
        ("沙和尚", ("沙僧", "悟净")),
    )
    for alias in aliases
}

def process_text(file_path, top_n=20):
    """Return the most frequent multi-character words of a UTF-8 text file.

    The file is read in full, segmented with ``jieba.lcut``, and every token
    that is a key of the module-level ``name_mapping`` is replaced by the
    name it maps to. Tokens shorter than two characters (after mapping) are
    discarded and the remaining tokens are counted.

    Args:
        file_path: Path of the text file to analyse; it is decoded as UTF-8.
        top_n: Number of most common words to return. Defaults to 20, which
            is the value this function previously had hard-coded. ``None``
            returns every counted word (``Counter.most_common`` semantics).

    Returns:
        A list of ``(word, count)`` tuples ordered from the most to the
        least frequent word, as produced by ``Counter.most_common``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Map aliases to their unified name lazily, token by token, so no
    # intermediate list of the whole corpus is built before counting.
    # NOTE(review): an alias is only merged when jieba emits it as a single
    # token; if the segmenter splits an alias (or a unified name) into
    # smaller pieces, the lookup below never sees it -- confirm against the
    # actual segmentation, and consider registering the names with jieba.
    unified_tokens = (name_mapping.get(token, token) for token in jieba.lcut(text))

    # The length filter is applied to the mapped name, and it also drops
    # single-character tokens such as punctuation marks and line breaks.
    word_counts = Counter(word for word in unified_tokens if len(word) >= 2)

    return word_counts.most_common(top_n)

# Usage example. The statements are guarded so that merely importing this
# module neither reads the corpus file nor prints anything; running the file
# directly as a script behaves exactly as before.
if __name__ == "__main__":
    file_path = '西游记.txt'  # replace with the actual path of the text file
    top_words = process_text(file_path)

    # Print the ranking, numbering the entries from 1.
    print("《西游记》高频词Top20(已合并人物别称):")
    for i, (word, count) in enumerate(top_words, 1):
        print(f"{i}. {word}: {count}次")
posted @ 2025-06-21 16:16  Lay“  阅读(11)  评论(0)    收藏  举报