jieba 分词

import jieba
from collections import defaultdict, Counter

# Canonical character name -> every surface form that should be counted as
# that character. Each list also contains the canonical name itself, so a
# single lookup table built from this mapping covers all spellings.
alias_map = {
    "孙悟空": [
        "孙悟空", "孙猴子", "大圣", "齐天大圣", "美猴王", "孙行者", "行者",
    ],
    "唐僧": ["唐僧", "唐三藏", "玄奘", "金蝉子", "师父"],
    "猪八戒": ["猪八戒", "猪悟能", "八戒", "天蓬元帅"],
    "沙僧": ["沙僧", "沙和尚", "沙悟净", "卷帘大将"],
    "白龙马": ["白龙马", "玉龙三太子"],
    "如来佛祖": ["如来", "如来佛祖", "释迦牟尼"],
    "观音菩萨": ["观音", "观音菩萨", "观世音"],
    "玉皇大帝": ["玉帝", "玉皇大帝"],
    "王母娘娘": ["王母", "王母娘娘"],
    "太上老君": ["老君", "太上老君"],
    "牛魔王": ["牛魔王", "牛大王"],
    "铁扇公主": ["铁扇公主", "罗刹女"],
    "红孩儿": ["红孩儿", "圣婴大王"],
    "白骨精": ["白骨精", "尸魔"],
    "二郎神": ["二郎神", "杨戬", "二郎真君"],
    "哪吒": ["哪吒", "三太子"],
    "李靖": ["李靖", "托塔天王"],
    "嫦娥": ["嫦娥", "月宫仙子"],
    "龙王": ["龙王", "老龙王"],
    "太白金星": ["太白金星", "太白"],
}

# Invert alias_map into a flat lookup table: alias -> canonical name.
# If the same alias were listed under two characters, the later entry wins.
name_standard = {}
for canonical, variants in alias_map.items():
    name_standard.update(dict.fromkeys(variants, canonical))

# Load the full text to analyse.
with open("西游记.txt", "r", encoding="utf-8") as f:
    content = f.read()

# Register every alias with jieba before segmenting. Without this, an alias
# that is missing from jieba's default dictionary can be split into smaller
# tokens, and then it would never match a key of name_standard below.
for alias in name_standard:
    jieba.add_word(alias)

words = jieba.lcut(content)

# Tally mentions per canonical character: each token that is a known alias
# contributes one count to the character it maps to.
person_count = Counter(
    name_standard[word] for word in words if word in name_standard
)

# Rank characters by mention count, highest first. The sort is stable, so
# characters with equal counts keep the order in which they were first seen.
sorted_person = sorted(
    person_count.items(),
    key=lambda entry: entry[1],
    reverse=True,
)
top20 = sorted_person[:20]

# Emit a tab-separated report: header lines, then one row per character.
print("西游记人物出现频次TOP20(已合并别名):")
print("排名\t人物\t出现次数")
for rank, (person, total) in enumerate(top20, start=1):
    print(f"{rank}\t{person}\t{total}")

posted @ 2026-07-03 14:32  Lin03  阅读(1)  评论(0)    收藏  举报