# jieba

import jieba
import re
from collections import Counter

# 1. 人物别名映射(合并同一人物的不同称呼)

# Alias table: canonical name -> alternative names that are counted as that
# canonical name.  Every alias must belong to exactly one canonical name,
# because this table is inverted into an alias -> canonical-name lookup in
# which a repeated alias would silently keep only its last owner.
synonym_map = {
    # Named characters and recurring character types
    "聂小倩": ["小倩", "聂女", "小倩女"],
    "宁采臣": ["采臣", "宁生", "宁君"],
    "狐仙": ["狐妖", "狐女", "狐", "狐狸精"],
    "书生": ["生", "士子", "秀才", "士人"],
    "鬼": ["鬼魂", "鬼魅", "鬼物", "阴魂"],
    "道士": ["道人", "道者", "方士"],
    "神": ["神灵", "神仙", "神君"],
    # The canonical name itself is not repeated inside its own alias list.
    "女子": ["女", "女郎", "女流"],
    # Family and social relations
    "母亲": ["母", "老母", "慈母"],
    "父亲": ["父", "老父", "严君"],
    "妻子": ["妻", "妇人", "内人"],
    "朋友": ["友", "友人", "故人"],
    "儿子": ["子", "儿", "孩儿"],
    # "女" is owned by "女子" above; listing it here as well made it ambiguous.
    "女儿": ["闺女", "小女"],
    "丈夫": ["夫", "夫君", "男子"],
    "县官": ["令", "县令", "官", "吏"],
    # Places and times
    "寺庙": ["寺", "庙", "祠", "庵"],
    "夜晚": ["夜", "夕", "暮", "晚"],
    "白天": ["日", "昼", "朝", "晨"],
    "家中": ["家", "宅", "室", "庐"],
    "山中": ["山", "岭", "峰", "谷"],
    "门外": ["门", "户", "扉", "牖"],
}

# 构建反向映射(将所有别名指向标准名)

# Invert the alias table into a flat alias -> canonical-name lookup.
# Entries are visited in the table's own order, so an alias listed under
# several canonical names ends up mapped to the last one.
alias_to_standard = {
    nickname: canonical
    for canonical, nicknames in synonym_map.items()
    for nickname in nicknames
}

# 2. 读取聊斋志异文本(请替换为你的文件路径)

# Location of the corpus.  A relative path is resolved against the current
# working directory, so keep the text file next to the script or switch to an
# absolute path.
file_path = "聊斋志异.txt"
with open(file_path, mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# 3. 文本预处理(过滤非中文字符)

# Keep only CJK ideographs (U+4E00-U+9FA5).  Each run of other characters
# (punctuation, line breaks, digits, Latin letters) is collapsed into a single
# space rather than deleted: deleting it would glue the end of one sentence
# to the start of the next and let the segmenter form words that straddle the
# boundary.  The space comes out of segmentation as a one-character token,
# which the single-character filter further down discards.
text = re.sub(r"[^\u4e00-\u9fa5]+", " ", text)

# 4. jieba分词

# Segment the cleaned text with jieba's default cut mode and materialise the
# resulting token stream as a list.
words = list(jieba.cut(text))

# 5. 词频统计 + 合并同义词

# Drop one-character tokens (mostly particles and other noise), then fold
# every remaining alias into its canonical name; tokens that are not aliases
# pass through unchanged.
# NOTE: the length test is applied to the token as segmented, before the
# alias lookup, so one-character aliases in the alias table never match here.
filtered_words = [
    alias_to_standard.get(token, token)
    for token in words
    if len(token) >= 2
]

# 统计词频

# Tally how often each word (after alias merging) occurs.
word_counts = Counter()
word_counts.update(filtered_words)

# 6. 获取并输出出现次数最高的20个词

# Report the 20 most frequent words, one per line, ranked from 1.
top20 = word_counts.most_common(20)
print("《聊斋志异》词频Top20(合并人物别名后):")
for rank, (term, occurrences) in enumerate(top20, start=1):
    print(f"{rank}. {term}: {occurrences}次")

# posted @ 2026-06-12 12:34  7_loveee  阅读(1)  评论(0)    收藏  举报