# jieba 分词

import jieba
from collections import Counter

# 人物别名映射:统一不同称呼为标准名称,满足题目合并要求

# Alias table: maps each alternative form of address to one canonical
# character name so that their occurrences are counted together.
# It is written as (canonical name, aliases) pairs and inverted into the
# alias -> canonical lookup that the rest of the script consumes.
alias_map = {
    alias: canonical
    for canonical, aliases in (
        ("聂小倩", ("小倩",)),
        ("宁采臣", ("宁生",)),
        ("燕赤霞", ("燕生",)),
        ("书生", ("生", "士子")),
        ("狐仙", ("狐", "狐女", "狐妖")),
        ("女鬼", ("鬼",)),
        ("女子", ("女郎",)),
        ("丫鬟", ("婢",)),
        ("老翁", ("翁",)),
        ("老妇", ("妪", "母")),
        ("男子", ("郎君",)),
        ("路人", ("客",)),
    )
    for alias in aliases
}

# 1. 读取聊斋文本

# Load the whole novel from "liaozhai.txt" in the current working directory
# into `content`. If the file is missing, print a hint and stop the script.
try:
    with open("liaozhai.txt", "r", encoding="utf-8") as f:
        content = f.read()
except FileNotFoundError:
    print("错误:当前目录未找到liaozhai.txt,请把小说文本放入该文件!")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # injected by the `site` module for interactive use and may be missing
    # (e.g. `python -S`), and with no argument it reports success (status 0).
    # A missing input file is a failure, so exit with a non-zero status.
    raise SystemExit(1) from None

# 2. jieba精确模式分词

# Segment the full text in jieba's precise mode (cut_all=False is the
# default) and materialize the token stream as a list.
all_words = list(jieba.cut(content, cut_all=False))

# 3. 清洗分词 + 别名统一替换

# Clean the tokens and merge aliases into `valid_words`.
#
# For each token: strip surrounding whitespace, map it to its canonical
# name through `alias_map` (unknown tokens are kept unchanged), and keep
# the result only if it is longer than one character.
valid_words = []
for word in all_words:
    word = word.strip()
    # Resolve the alias BEFORE the length filter. `alias_map` contains
    # single-character keys (e.g. "生", "狐", "鬼"); filtering on the raw
    # token first would discard them, so they would never be merged.
    standard_name = alias_map.get(word, word)
    # Drop empty strings and remaining single characters (punctuation,
    # function words) that carry no meaning for the frequency ranking.
    if len(standard_name) <= 1:
        continue
    valid_words.append(standard_name)

# 4. 统计词频,取出现最多前20

# Tally how often each cleaned word occurs, then take the 20 most frequent
# entries as (word, count) pairs ordered from most to least common.
word_frequency = Counter()
word_frequency.update(valid_words)
top20_list = word_frequency.most_common(20)

# 5. 格式化打印结果

# Print the report: title, column header, separator line, then one
# left-aligned row (rank, word, count) per entry of `top20_list`.
title = "《聊斋志异》词频TOP20(别称已合并)"
header = f"{'排名':<4}{'词汇':<8}{'出现次数':<6}"
separator = "-" * 22
for line in (title, header, separator):
    print(line)

for position, entry in enumerate(top20_list, start=1):
    term, occurrences = entry
    print(f"{position:<4}{term:<8}{occurrences:<6}")

posted @ 2026-06-18 19:07  pyy_0630  阅读(2)  评论(0)    收藏  举报