西游记分词

屏幕截图 2026-07-02 204313
"""Word-frequency statistics for chapter 8 of "Journey to the West".

The script reads the chapter text, segments it with jieba, merges the
different names used for the same character into one canonical name,
drops common filler words, and prints the most frequent remaining words.
"""

from collections import Counter
from typing import Iterable

# Chapter text to analyse (expected to be UTF-8 encoded).
TEXT_PATH = r"d:\python test\VS code\西游记_第8回.txt"

# Number of entries printed in the report.
TOP_N = 10

# High-frequency words that are not character names and should not be
# reported. Single-character words are not listed here because
# count_words() already discards every token shorter than two characters.
EXCLUDES = frozenset({
    "却说", "二人", "不可", "不能", "如此", "说道", "只见", "一面", "什么",
    "不是", "知道", "听见", "进来", "出去", "起来", "笑道", "问道", "答道",
    "回头", "一时", "今日", "明日", "方才", "后来", "如今", "原来", "因此",
    "只得", "怎么", "那里", "这个", "那个", "你们", "我们", "他们", "人家",
    "东西", "事情", "说话", "出来", "进去", "大家", "自己",
    "一个", "两个", "一半", "一会", "一下", "一点", "一些",
    "一样", "一般", "不过", "只是", "就是", "还有", "连忙", "急忙", "忽然",
    "果然", "其实", "本来", "可是", "但是", "虽然", "既然", "因为", "所以",
    "如果", "那么", "只要", "只有", "无论",
    "此间", "此处", "如何", "为何", "何为", "何故", "这般", "那厢", "这边",
    "那怪", "怪物", "弟子", "贫僧", "师父", "徒弟", "正是", "真个",
})

# Canonical character name -> alternative names used for that character.
# Characters without aliases need no entry: unknown words are counted
# under their own spelling.
_CHARACTER_ALIASES = {
    "如来佛祖": ("如来", "我佛", "佛祖", "世尊"),
    "观音菩萨": ("观音", "菩萨", "观世音", "观音尊者"),
    "惠岸行者": ("木吒", "惠岸"),
    "沙悟净": ("卷帘大将",),
    "猪悟能": ("天蓬元帅", "悟能"),
    "孙悟空": ("大圣", "齐天大圣", "妖猴", "乖猴"),
    "玉皇大帝": ("玉帝",),
    "西海小龙": ("玉龙", "小龙", "孽龙"),
}

# Inverted view of the table above: alias -> canonical name.
ALIAS_TO_NAME = {
    alias: name
    for name, aliases in _CHARACTER_ALIASES.items()
    for alias in aliases
}


def count_words(words: Iterable[str], excludes=EXCLUDES, aliases=ALIAS_TO_NAME) -> Counter:
    """Count word occurrences after alias merging and filtering.

    Args:
        words: Segmented tokens, e.g. the list returned by ``jieba.lcut``.
        excludes: Words that must not appear in the result. The check is
            made on the canonical name, i.e. after alias merging.
        aliases: Mapping from an alias to the canonical name it is
            counted under.

    Returns:
        A ``Counter`` keyed by canonical word. Keys are inserted in order
        of first occurrence, which is what ``most_common`` uses to order
        words with equal counts.
    """
    counts = Counter()
    for word in words:
        # Tokens shorter than two characters are single characters,
        # punctuation, whitespace or empty strings - never a name.
        if len(word) < 2:
            continue
        name = aliases.get(word, word)
        if name not in excludes:
            counts[name] += 1
    return counts


def main(path=TEXT_PATH, top_n=TOP_N):
    """Print the ``top_n`` most frequent words of the text file at ``path``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Third-party dependency; imported here so that count_words() stays
    # importable on machines where jieba is not installed.
    import jieba

    with open(path, "r", encoding="utf-8") as f:
        text = f.read()

    counts = count_words(jieba.lcut(text))

    print(f"《西游记》第8回 人物出场词频统计(前{top_n}名):")
    # most_common() returns fewer rows when there are fewer distinct
    # words, so no explicit bounds check is needed.
    for word, count in counts.most_common(top_n):
        print(f"{word:<10}{count:>5}")


if __name__ == "__main__":
    main()

posted @ 2026-07-02 20:45  user6666666999999  阅读(3)  评论(0)    收藏  举报