jieba word segmentation

import jieba

# Read the Liaozhai text; the file is assumed to be named "liaozhai.txt", adjust the path and filename as needed
with open("liaozhai.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Map alternative references to the same character onto one canonical name; extend as needed
name_mapping = {
    "婴宁娘子": "婴宁",
    "聂小倩姑娘": "聂小倩"
}
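
# Note: jieba may split multi-character names such as "婴宁娘子" into
# smaller pieces, in which case the mapping above never matches.
# Registering the names with jieba.add_word (jieba's API for custom
# words) before segmenting keeps them whole; the list below mirrors
# the mapping dict and is an illustrative assumption:
for name in ["婴宁", "婴宁娘子", "聂小倩", "聂小倩姑娘"]:
    jieba.add_word(name)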

# Segment the text into a list of words (jieba.lcut returns a list; jieba.cut would return a generator)
words = jieba.lcut(text)

# Count word frequencies, mapping name variants to their canonical form first
word_count = {}
for word in words:
    if word in name_mapping:
        word = name_mapping[word]
    if word in word_count:
        word_count[word] += 1
    else:
        word_count[word] = 1
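
# Shortcut (sketch, commented out so the tally above is not duplicated):
# the standard library's collections.Counter performs the same counting,
# and its most_common() method would replace the manual sort below.
# from collections import Counter
# word_count = Counter(name_mapping.get(w, w) for w in words)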

# Drop single-character tokens (adjust as needed)
filtered_word_count = {k: v for k, v in word_count.items() if len(k) > 1}
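
# Optional sketch: common function words can still crowd out character
# names. Filtering against a stopword list helps; this assumes a local
# file "stopwords.txt" (one word per line), which is not part of the
# original setup, so it is left commented out:
# with open("stopwords.txt", "r", encoding="utf-8") as f:
#     stopwords = set(f.read().split())
# filtered_word_count = {k: v for k, v in filtered_word_count.items() if k not in stopwords}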

# Sort by word frequency in descending order
sorted_word_count = sorted(filtered_word_count.items(), key=lambda item: item[1], reverse=True)

# Take the top 20 high-frequency words
top_20_words = sorted_word_count[:20]

# Print the results
for word, count in top_20_words:
    print(f"词语:{word},出现次数:{count}")
