# 聊斋jieba: count how often character names appear in 《聊斋志异》 using jieba word segmentation.
import jieba
from collections import defaultdict
import re
# Mapping from alternate names/titles of a character to the single canonical
# name under which its occurrences are counted.
name_mapping = {
    "孙猴子": "孙悟空",
    "齐天大圣": "孙悟空",
    "美猴王": "孙悟空",
    "狐女": "小翠",
    "聂小倩": "小倩",
    # Add further aliases that should be merged here.
}

# Character class for the CJK ideograph range U+4E00..U+9FA5; used with
# fullmatch so a token qualifies only if every character is in that range.
_CJK_ONLY = re.compile(r'[\u4e00-\u9fa5]+')


def count_names(words, mapping=None):
    """Count candidate name tokens, merging aliases into canonical names.

    A token is counted only if it is 2 to 4 characters long and consists
    entirely of characters in U+4E00..U+9FA5. No part-of-speech filtering is
    done, so frequent non-name words of that shape are counted as well.

    Args:
        words: Iterable of string tokens (e.g. the output of ``jieba.lcut``).
        mapping: Optional alias -> canonical-name dict. Defaults to the
            module-level ``name_mapping``.

    Returns:
        A ``defaultdict(int)`` mapping each (canonicalised) token to its
        number of occurrences.
    """
    if mapping is None:
        mapping = name_mapping
    counts = defaultdict(int)
    for word in words:
        if 2 <= len(word) <= 4 and _CJK_ONLY.fullmatch(word):
            # Fold alternate names into one key before counting.
            counts[mapping.get(word, word)] += 1
    return counts


def top_names(word_count, limit=20):
    """Return the ``limit`` most frequent ``(name, count)`` pairs.

    Pairs are ordered by descending count; the sort is stable, so ties keep
    the order in which the names were first inserted into ``word_count``.
    """
    ranked = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
    return ranked[:limit]


def main():
    """Read '聊斋.txt', tokenise it with jieba, and print the top 20 names."""
    # Read the full text of the book.
    with open('聊斋.txt', 'r', encoding='utf-8') as f:
        text = f.read()

    # Segment the text into a list of tokens with jieba.
    words = jieba.lcut(text)

    # Tally the tokens that pass the name filter, then take the 20 most frequent.
    word_count = count_names(words)
    top_20 = top_names(word_count)

    # Print the ranking, numbered from 1.
    print("《聊斋志异》中出现频率最高的20个人物名称:")
    for i, (name, count) in enumerate(top_20, 1):
        print(f"{i}. {name}: {count}次")


if __name__ == "__main__":
    main()
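# Footnote: \u4e00-\u9fa5 is the CJK ideograph range that belongs inside the square brackets of the name-filter regex.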

# 浙公网安备 33010602011771号 (hosting-site registration footer; not part of the script)