聊斋志异 (Strange Tales from a Chinese Studio)
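The short script below counts word frequencies in the text of the book: it segments the full text with jieba, filters out non-Chinese and single-character tokens, merges common character-name aliases onto one canonical name, and prints the top 20 most frequent words.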
```python
import jieba
from collections import Counter
import re

# Character-name merging rules (extend with more entries as needed)
name_mapping = {
    "宁采臣": ["宁生", "宁公子", "宁郎"],
    "聂小倩": ["小倩", "聂女", "倩娘"],
    "燕赤霞": ["燕生", "燕道士", "燕侠"],
    "婴宁": ["婴娘", "宁姑"],
    "王子服": ["王生", "王郎"],
    "莲香": ["莲姐", "莲娘"],
    "桑晓": ["桑生", "桑郎"],
    "辛十四娘": ["十四娘", "辛女"],
    # more name mappings can be added here
}

# Reverse mapping from alias to canonical name, for fast lookup
reverse_name_mapping = {}
for main_name, aliases in name_mapping.items():
    for alias in aliases:
        reverse_name_mapping[alias] = main_name

def normalize_name(word):
    """Map an alias to its canonical character name."""
    return reverse_name_mapping.get(word, word)

def is_chinese_word(word):
    """Return True if the word consists only of Chinese characters."""
    return re.match(r'[\u4e00-\u9fa5]+$', word) is not None

def analyze_liaozhai_top_words(file_path, top_n=20):
    # Read the text file
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # Segment the text with jieba
    words = jieba.lcut(text)
    # Keep only multi-character Chinese words, normalizing aliases
    filtered_words = [
        normalize_name(word) for word in words
        if is_chinese_word(word) and len(word) > 1
    ]
    # Count word frequencies
    word_counts = Counter(filtered_words)
    # Take the top_n most frequent words
    top_words = word_counts.most_common(top_n)
    return top_words

# Usage example
if __name__ == "__main__":
    # Assumes the text of Liaozhai is saved as "liaozhai.txt"
    file_path = "liaozhai.txt"
    top_words = analyze_liaozhai_top_words(file_path)
    print("Top 20 high-frequency words in Liaozhai:")
    for i, (word, count) in enumerate(top_words, 1):
        print(f"{i}. {word}: {count} times")
```
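One caveat: jieba's default dictionary does not know these character names, so it may split a name such as 燕赤霞 into smaller pieces before `normalize_name` ever sees it. A minimal sketch of one way to mitigate this, assuming the `name_mapping` dictionary defined above, is to register every canonical name and alias with `jieba.add_word` before segmenting:

```python
import jieba

def register_character_names(name_mapping):
    """Register canonical names and aliases as custom words so jieba
    keeps them intact during segmentation (sketch; name_mapping is
    the dictionary defined in the script above)."""
    for main_name, aliases in name_mapping.items():
        jieba.add_word(main_name)
        for alias in aliases:
            jieba.add_word(alias)
```

Calling `register_character_names(name_mapping)` once before `jieba.lcut`, for example at the top of `analyze_liaozhai_top_words`, keeps the alias counts from being diluted by partial splits; for larger alias lists, `jieba.load_userdict` with a user dictionary file achieves the same effect.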