# 西游记 (Journey to the West)
import jieba
from collections import Counter
import matplotlib.pyplot as plt
# 1. Read the local "Journey to the West" text file (UTF-8 encoded).
with open("西游记.txt", "r", encoding="utf-8") as f:
    # Load the whole file into memory; the tokenizer below consumes `text`
    # in a single call.
    text = f.read()
# 2. Load a custom dictionary (optional; improves person-name recognition).
# custom_dict.txt holds one entry per line, e.g. "孙悟空 nr".
try:
    jieba.load_userdict("custom_dict.txt")
except FileNotFoundError:
    # The dictionary is optional: fall back to jieba's built-in vocabulary
    # instead of aborting the whole analysis when the file is absent.
    print("custom_dict.txt not found; using jieba's default dictionary.")
# 3. Names and keywords to count (adjust as needed).
# A set gives O(1) membership tests when filtering the token stream.
name_keywords = {
    "悟空", "孙悟空", "行者", "齐天大圣",  # aliases of Sun Wukong
    "八戒", "猪八戒", "悟能",  # aliases of Zhu Bajie
    "唐僧", "三藏", "玄奘",  # aliases of Tang Seng
    "沙僧", "悟净", "沙和尚",  # aliases of Sha Seng
    "观音", "菩萨", "观世音",  # Guanyin Bodhisattva
    "如来", "佛祖",  # the Buddha (Rulai)
    "玉帝", "玉皇大帝",  # the Jade Emperor
    "龙王", "东海龙王",  # the Dragon King
    "妖怪", "妖精", "白骨精", "牛魔王",  # common demons
}
# 4. Tokenize the text and keep only the tracked names/keywords.
words = jieba.lcut(text)
# Filter on membership alone. The earlier extra clause `len(word) >= 2`
# admitted every multi-character token (and every tracked keyword is already
# at least two characters long), so the "character" ranking was in fact a
# ranking of all common words.
filtered_words = [word for word in words if word in name_keywords]
# 5. Count occurrences and take the 20 most frequent entries.
word_counts = Counter(filtered_words)
top_20 = word_counts.most_common(20)

# 6. Print the ranking, one "<word>: <count>次" line per entry.
print("《西游记》高频人物TOP20:")
for word, count in top_20:
    print(f"{word}: {count}次")
# 7. Visualization (optional).
# Matplotlib's default font has no CJK glyphs, so the Chinese title and tick
# labels would render as empty boxes. Put common CJK fonts ahead of the
# existing sans-serif fallbacks; which of them is installed depends on the OS.
plt.rcParams["font.sans-serif"] = [
    "SimHei",
    "Microsoft YaHei",
    "PingFang SC",
    "Noto Sans CJK SC",
    "WenQuanYi Micro Hei",
    "Arial Unicode MS",
] + list(plt.rcParams["font.sans-serif"])
# Many CJK fonts lack the Unicode minus sign; use the ASCII hyphen instead.
plt.rcParams["axes.unicode_minus"] = False

# zip(*top_20) cannot be unpacked into two names when nothing matched, so
# only draw the chart when there is data to show.
if top_20:
    plt.figure(figsize=(12, 6))
    names, counts = zip(*top_20)
    plt.bar(names, counts, color="orange")
    plt.title("《西游记》人物出现频率TOP20", fontsize=14)
    plt.xticks(rotation=45, ha="right")  # rotate x-axis labels for legibility
    plt.tight_layout()  # let matplotlib fit the rotated labels
    plt.show()

# 浙公网安备 33010602011771号