import re
from collections import defaultdict

import jieba

# Canonical names of Journey to the West characters, each mapped to the list
# of alternative names (aliases) that should be normalized to it.
character_aliases = {
    "孙悟空": ["悟空", "孙行者", "美猴王", "齐天大圣", "斗战胜佛"],
    "唐僧": ["玄奘", "唐三藏", "唐玄奘", "金蝉子"],
    "猪八戒": ["八戒", "猪悟能", "天蓬元帅", "猪刚鬣"],
    "沙僧": ["沙悟净", "沙和尚", "卷帘大将", "金身罗汉"],
    # Group designation, not an individual. It is kept as a recognized name,
    # but its members must NOT be listed as aliases: doing so would make every
    # canonical member name resolve to the group when the mapping is reversed.
    "唐僧师徒": [],
    "观音菩萨": ["观音", "观世音", "观音大士"],
    "如来佛祖": ["如来", "释迦牟尼"],
    "玉皇大帝": ["玉帝", "玉皇", "天帝"],
    "牛魔王": ["牛魔", "大力王"],
    "白骨精": ["白骨夫人", "白骨老妖"],
}

# Reverse lookup (alias -> canonical name) used when normalizing text.
# A name that is itself a canonical key is never registered as an alias of
# another entry; otherwise that canonical name would be rewritten away.
alias_to_character = {
    alias: canonical
    for canonical, aliases in character_aliases.items()
    for alias in aliases
    if alias not in character_aliases
}

# Extend jieba's dictionary with every canonical character name so the
# segmenter can emit each of them as a single token.
for char in character_aliases:
    jieba.add_word(char)

def process_xiyouji_text(text):
    """Normalize character aliases in *text*, segment it, and count characters.

    Aliases from ``alias_to_character`` are rewritten to their canonical names
    in one left-to-right pass, so the output of one replacement is never
    rescanned by another. Canonical names already present in the text are
    matched as well and left unchanged; this prevents, for example, "孙悟空"
    from becoming "孙孙悟空" through the shorter alias "悟空".

    Args:
        text: The raw text to process.

    Returns:
        A ``(words, character_count)`` tuple. ``words`` is the token list
        produced by ``jieba.lcut`` on the normalized text. ``character_count``
        is a ``defaultdict(int)`` mapping each canonical name (a key of
        ``character_aliases``) found among the tokens to its occurrence count.
    """
    # Step 1: build the lookup. Canonical names map to themselves and override
    # any alias entry that happens to share the same spelling.
    canonical_of = dict(alias_to_character)
    canonical_of.update((name, name) for name in character_aliases)

    # Regex alternation tries alternatives in order, so listing longer names
    # first ensures "唐玄奘" wins over "玄奘" and "观音大士" over "观音".
    # Empty names are dropped because an empty alternative matches everywhere.
    names = sorted((name for name in canonical_of if name), key=len, reverse=True)
    if names:
        pattern = re.compile("|".join(re.escape(name) for name in names))
        text = pattern.sub(lambda match: canonical_of[match.group(0)], text)

    # Step 2: segment the normalized text.
    words = jieba.lcut(text)

    # Step 3: count tokens that are canonical character names.
    character_count = defaultdict(int)
    for word in words:
        if word in character_aliases:
            character_count[word] += 1

    return words, character_count

# Sample text that mixes canonical names and aliases.
sample_text = "悟空和唐僧去取经,路上遇到了白骨夫人,后来观音大士帮助了他们,唐三藏念紧箍咒让孙行者头疼"
words, count = process_xiyouji_text(sample_text)

# Print the segmentation result followed by the per-character counts.
print("分词结果:", words)
print("人物出现统计:")
for char, cnt in count.items():
    print(f"{char}: {cnt}次")