聊斋志异

import jieba
from collections import Counter
import re

First, prepare a text file of 聊斋志异 (assumed here to be liaozhai.txt).

We define a function to read the file contents:

def read_text(file_path):
    # Read the whole file as UTF-8 text
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    return text

Alias-merging rules for character names (examples only; extend them to match your text):

name_mapping = {
    '宁采臣': ['宁生', '宁公子', '采臣'],
    '聂小倩': ['小倩', '聂女', '倩娘'],
    '燕赤霞': ['燕生', '燕道士', '赤霞'],
    '婴宁': ['婴娘', '宁女'],
    '辛十四娘': ['十四娘', '辛女'],
    # Add more characters and their aliases here
}

Build the reverse mapping so each alias can be looked up quickly:

reverse_mapping = {}
for main_name, aliases in name_mapping.items():
    for alias in aliases:
        reverse_mapping[alias] = main_name
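One caveat worth noting: jieba's default dictionary may split multi-character names such as 聂小倩 or 辛十四娘 into smaller tokens, in which case the merging step never sees them. A minimal sketch, assuming the name_mapping defined above, registers every main name and alias as a custom word before tokenizing (jieba.add_word is part of jieba's public API):

# Register all main names and aliases with jieba so they survive
# tokenization as single tokens instead of being split apart
for main_name, aliases in name_mapping.items():
    jieba.add_word(main_name)
    for alias in aliases:
        jieba.add_word(alias)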

Tokenize the text and count word frequencies:

def analyze_text(text):
    # Tokenize with jieba
    words = jieba.lcut(text)

    # Count raw token frequencies
    word_counts = Counter(words)

    # Merge character aliases into their canonical main names;
    # every other token keeps its own key
    merged_counts = {}
    for word, count in word_counts.items():
        main_name = reverse_mapping.get(word, word)
        merged_counts[main_name] = merged_counts.get(main_name, 0) + count

    return merged_counts

Read the text:

text = read_text('liaozhai.txt')

Analyze the text:

word_counts = analyze_text(text)

Get the 20 most frequent words:

top_20 = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:20]
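Equivalently, since merged_counts is a plain dict of counts, wrapping it in a Counter lets most_common do the sorting for us:

# Same result as the sorted(...) line above
top_20 = Counter(word_counts).most_common(20)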

Print the results:

print("聊斋志异中出现次数前20的词汇(合并人物别名):")
for word, count in top_20:
print(f"{word}: {count}次")
