jieba word segmentation assignment — Information and Computing Science Class 2, No. 17, Zeng Xiangsong

import jieba
from collections import Counter


# Map each character's formal name to aliases that appear in the text
name_mapping = {"宁采臣": ["宁生", "采臣"], "聂小倩": ["小倩", "聂女"]}
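
# Register the names and aliases as jieba user words so they segment as
# single tokens (otherwise jieba may split e.g. "宁采臣" into smaller pieces).
# This is a sketch; loading a custom dictionary file via jieba.load_userdict
# would work as well.
for formal_name, aliases in name_mapping.items():
    jieba.add_word(formal_name)
    for alias in aliases:
        jieba.add_word(alias)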

# 1. Read the text of Liaozhai (replace with the actual file path)
with open("liaozhai.txt", "r", encoding="utf-8") as f:
    text = f.read()

# 2. Segment the text into words
words = jieba.lcut(text)

# 3. Merge character aliases into their formal names
merged_words = []
for word in words:
    for formal_name, aliases in name_mapping.items():
        if word in aliases:
            merged_words.append(formal_name)
            break
    else:
        merged_words.append(word)
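
# Equivalent variant (a sketch): flatten the alias table into a reverse
# lookup dict, so each token costs one dict lookup instead of a scan.
# Produces the same merged_words as the loop above.
alias_to_formal = {alias: formal
                   for formal, aliases in name_mapping.items()
                   for alias in aliases}
merged_words = [alias_to_formal.get(word, word) for word in words]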

# 4. Filter out meaningless tokens (the stopword list can be extended)
stopwords = ["的", "了", "是", "在", "和", "也", "呀"]
# Drop stopwords and single-character tokens
filtered_words = [word for word in merged_words if word not in stopwords and len(word) > 1]

# 5. Count word frequencies and take the top 20
word_count = Counter(filtered_words)
top20 = word_count.most_common(20)

# 6. Print the results
for word, count in top20:
    print(f"{word}: {count}")
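
The inline stopword list above is only a starting point. Below is a minimal
sketch of loading a fuller list from a file, assuming a hypothetical
stopwords.txt (UTF-8, one word per line); the filtering step then reuses it.

with open("stopwords.txt", "r", encoding="utf-8") as f:
    stopwords = {line.strip() for line in f if line.strip()}
filtered_words = [w for w in merged_words if w not in stopwords and len(w) > 1]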
