聊斋志异

import jieba
from collections import Counter
import re

# Character-name merge rules (extend as needed): each canonical name maps
# to the aliases that should be counted under it.
name_mapping = {
    "宁采臣": ["宁生", "宁公子", "宁郎"],
    "聂小倩": ["小倩", "聂女", "倩娘"],
    "燕赤霞": ["燕生", "燕道士", "燕侠"],
    "婴宁": ["婴娘", "宁姑"],
    "王子服": ["王生", "王郎"],
    "莲香": ["莲姐", "莲娘"],
    "桑晓": ["桑生", "桑郎"],
    "辛十四娘": ["十四娘", "辛女"],
    # Additional character-name mappings can be added here.
}

# Reverse mapping (alias -> canonical name) for O(1) lookup during counting.
reverse_name_mapping = {
    alias: main_name
    for main_name, aliases in name_mapping.items()
    for alias in aliases
}

def normalize_name(word):
    """Map an alias to its canonical character name; unknown words pass through."""
    if word in reverse_name_mapping:
        return reverse_name_mapping[word]
    return word

def is_chinese_word(word):
    """Return True if *word* consists entirely of Chinese (CJK) characters.

    The empty string returns False ("+" requires at least one character).
    """
    # NOTE(review): the original paste showed "[1]+$" with the real class moved
    # into a footnote; the intended range is the CJK Unified Ideographs block.
    return re.match(r'[\u4e00-\u9fa5]+$', word) is not None

def analyze_liaozhai_top_words(file_path, top_n=20):
    """Return the top-N most frequent multi-character Chinese words in a file.

    Aliases of known characters are merged into canonical names via
    ``normalize_name`` before counting.

    Args:
        file_path: Path to a UTF-8 encoded text file.
        top_n: Number of high-frequency words to return (default 20).

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    # Read the whole text at once; the corpus is small enough for memory.
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Segment the text with jieba (Chinese word segmentation).
    words = jieba.lcut(text)

    # Keep only purely-Chinese tokens longer than one character,
    # folding aliases into canonical character names.
    filtered_words = [
        normalize_name(word) for word in words
        if is_chinese_word(word) and len(word) > 1
    ]

    # Count frequencies and return the top_n most common words.
    word_counts = Counter(filtered_words)
    return word_counts.most_common(top_n)

# Example usage.
if __name__ == "__main__":
    # Assumes the Liaozhai text is stored in "liaozhai.txt".
    file_path = "liaozhai.txt"
    top_words = analyze_liaozhai_top_words(file_path)

    print("聊斋高频词汇Top 20:")
    for i, (word, count) in enumerate(top_words, 1):
        print(f"{i}. {word}: {count}次")

  1. \u4e00-\u9fa5 ↩︎

posted @ 2025-06-29 21:20  飕飕  阅读(162)  评论(0)    收藏  举报