聊斋志异
import jieba
from collections import Counter

# First, prepare the text file of 聊斋志异 (assumed here to be liaozhai.txt).
# Define a helper to read the file contents.
def read_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    return text
# Character-name merging rules (examples; extend as needed for the actual text).
name_mapping = {
    '宁采臣': ['宁生', '宁公子', '采臣'],
    '聂小倩': ['小倩', '聂女', '倩娘'],
    '燕赤霞': ['燕生', '燕道士', '赤霞'],
    '婴宁': ['婴娘', '宁女'],
    '辛十四娘': ['十四娘', '辛女'],
    # Further characters and their aliases can be added here.
}
# Build the reverse mapping (alias -> main name) for quick lookup.
reverse_mapping = {}
for main_name, aliases in name_mapping.items():
    for alias in aliases:
        reverse_mapping[alias] = main_name
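# jieba's default dictionary does not know these proper names and may split
# them into smaller tokens, which would skew the counts. A minimal, optional
# sketch (assuming the name_mapping above) that registers every main name and
# alias as a custom word before segmentation:
for main_name, aliases in name_mapping.items():
    jieba.add_word(main_name)       # keep the main name as one token
    for alias in aliases:
        jieba.add_word(alias)       # keep each alias as one token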
# Segment the text and count word frequencies, merging character aliases.
def analyze_text(text):
    # Segment with jieba.
    words = jieba.lcut(text)
    # Count raw word frequencies.
    word_counts = Counter(words)
    # Merge character aliases into their main names.
    merged_counts = {}
    for word, count in word_counts.items():
        if word in reverse_mapping:
            # Alias: fold its count into the main name.
            main_name = reverse_mapping[word]
            merged_counts[main_name] = merged_counts.get(main_name, 0) + count
        else:
            # Main names and all other words are counted as they are.
            merged_counts[word] = merged_counts.get(word, 0) + count
    return merged_counts
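# If only the characters themselves are of interest, the merged counts can be
# narrowed to the main names; a small optional sketch reusing name_mapping:
def character_counts(merged_counts):
    # Keep only entries whose key is a main character name.
    return {name: merged_counts.get(name, 0) for name in name_mapping}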
# Read the text.
text = read_text('liaozhai.txt')

# Analyze it.
word_counts = analyze_text(text)

# Take the 20 most frequent words.
top_20 = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:20]

# Print the results.
print("Top 20 words in 聊斋志异 (character aliases merged):")
for word, count in top_20:
    print(f"{word}: {count}")