jieba word segmentation for 《聊斋志异》 (Liaozhai Zhiyi)

import jieba
from collections import Counter

# ====================== 1. Character alias map (core: merge entries for the same character) ======================

# Manually compiled common aliases for characters in 《聊斋志异》; extend the map as the text requires.

liaozhai_alias_map = {
    "宁采臣": ["宁生", "采臣"],
    "聂小倩": ["小倩", "聂氏"],
    "燕赤霞": ["燕生", "赤霞"],
    # Add more character mappings as needed, e.g.:
    # "连城": ["连氏", "连城姑娘"],
}

# Build the reverse mapping (alias -> real name) to make replacement easy

alias_to_name = {}
for real_name, aliases in liaozhai_alias_map.items():
    for alias in aliases:
        alias_to_name[alias] = real_name
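
# Optional sketch: jieba's default dictionary may split names such as "宁采臣"
# into smaller tokens, in which case the alias replacement below would never
# match them. Registering every canonical name and alias as a custom word
# (jieba.add_word is a standard jieba API) makes the names segment as whole tokens.
for real_name, aliases in liaozhai_alias_map.items():
    jieba.add_word(real_name)
    for alias in aliases:
        jieba.add_word(alias)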

# ====================== 2. Load the text & segment it ======================

def read_liaozhai_text():
    """
    Read the Liaozhai Zhiyi text (prepare the file in advance, or swap in online loading).
    Source text can be downloaded from classical-text sites such as https://ctext.org/zhs
    """
    with open("liaozhai.txt", "r", encoding="utf-8") as f:  # replace with your own file path
        return f.read()

def 分词并替换别名(text):
    """Segment the text and replace aliases (merging mentions of the same character)."""
    # Segment with jieba's accurate mode
    words = jieba.lcut(text)

    # Replace aliases
    replaced_words = []
    for word in words:
        # If the word is a known alias, swap in the real name; otherwise keep the word as-is
        replaced_words.append(alias_to_name.get(word, word))

    return replaced_words
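# Optional sketch: a quick sanity check on a made-up sentence, assuming the alias
# map above; once the names are registered with jieba.add_word, "宁生" and "小倩"
# should come back as their canonical forms after replacement.
# print(分词并替换别名("宁生与小倩相遇"))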

# ====================== 3. Count word frequencies & print the top results ======================

def get_top20_words(words):
    """Count word frequencies and return the 20 most frequent words."""
    # Filter out meaningless function words (adjust the stopword set as needed)
    stopwords = {"的", "了", "是", "在", "也", "呀", "呢", "啊"}
    filtered_words = [word for word in words if word not in stopwords]

    # Count word frequencies
    word_count = Counter(filtered_words)

    # Return the top 20
    return word_count.most_common(20)
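# Optional sketch: if the goal is character frequency rather than general word
# frequency, counting only the canonical names from liaozhai_alias_map gives a
# cleaner ranking. get_character_counts is a hypothetical helper, not part of
# the original script.
def get_character_counts(words):
    """Count occurrences of each canonical character name."""
    names = set(liaozhai_alias_map.keys())
    return Counter(word for word in words if word in names).most_common()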

# ====================== 4. Main routine ======================

if __name__ == "__main__":
    # 1. Load the text
    text = read_liaozhai_text()

    # 2. Segment and replace aliases
    words = 分词并替换别名(text)

    # 3. Count word frequencies
    top20 = get_top20_words(words)

    # 4. Print the result
    print("《聊斋志异》 word-frequency statistics (after merging character aliases):")
    for idx, (word, count) in enumerate(top20, start=1):
        print(f"{idx}. {word}\tcount: {count}")
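
    # Optional sketch: persist the result for reuse elsewhere; the output path
    # "liaozhai_top20.txt" is only a placeholder.
    # with open("liaozhai_top20.txt", "w", encoding="utf-8") as out:
    #     for word, count in top20:
    #         out.write(f"{word}\t{count}\n")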