# Word-frequency analysis of the Liaozhai text, merging character aliases
# into canonical names before counting.
import jieba
from collections import Counter

# Map each character's canonical name to the aliases used in the text
name_mapping = {"宁采臣": ["宁生", "采臣"], "聂小倩": ["小倩", "聂女"]}
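
# Optional: register the names and aliases with jieba's dictionary so the
# tokenizer keeps them as single tokens (assumes the default dictionary
# does not already cover them).
for formal_name, aliases in name_mapping.items():
    jieba.add_word(formal_name)
    for alias in aliases:
        jieba.add_word(alias)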
# 1. Read the Liaozhai text (replace "liaozhai.txt" with the actual file path)
with open("liaozhai.txt", "r", encoding="utf-8") as f:
    text = f.read()
# 2. Tokenize the text with jieba
words = jieba.lcut(text)
# 3. Merge character aliases into canonical names
merged_words = []
for word in words:
    for formal_name, aliases in name_mapping.items():
        if word in aliases:
            merged_words.append(formal_name)
            break
    else:
        # for-else: no alias matched, so keep the original token
        merged_words.append(word)
# 4. Drop stopwords and single-character tokens (extend the stopword list as needed)
stopwords = ["的", "了", "是", "在", "和", "也", "呀"]
filtered_words = [word for word in merged_words if word not in stopwords and len(word) > 1]
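
# Optional sketch: if a fuller stopword file is available (hypothetical
# "stopwords.txt", one word per line), use it instead of the short inline
# list above and redo the filtering.
import os
if os.path.exists("stopwords.txt"):
    with open("stopwords.txt", "r", encoding="utf-8") as f:
        stopwords = {line.strip() for line in f if line.strip()}
    filtered_words = [word for word in merged_words if word not in stopwords and len(word) > 1]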
# 5. Count word frequencies and take the 20 most common
word_count = Counter(filtered_words)
top20 = word_count.most_common(20)
# 6. Print the results
for word, count in top20:
    print(f"{word}: {count}")