《红楼梦》词频统计(出现次数最多的前20个词)

查看代码

import jieba
from collections import Counter

# Load the full text of the novel into memory. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open until
# the interpreter happens to collect it.
with open("C:\\Users\\黄楚玉\\Desktop\\杂七杂八\\hongloumeng.txt", "r", encoding='utf-8') as novel_file:
    text = novel_file.read()
     

# Canonical character name -> every spelling that should be counted as that
# character (the canonical name itself is included in its own alias list).
personName = {
    '宝玉': ['宝玉', '贾宝玉', '宝二爷'],
    '黛玉': ['黛玉', '林黛玉', '林妹妹', '颦儿'],
    '宝钗': ['宝钗', '薛宝钗', '宝姐姐'],
    '王熙凤': ['王熙凤', '凤姐', '凤辣子'],
    '贾母': ['贾母', '老太太'],
}

# Inverted lookup table: alias -> canonical name. Entries are inserted in the
# same order as a nested loop over personName would insert them.
replace_dict = {
    alias: main_name
    for main_name, aliases in personName.items()
    for alias in aliases
}
# Register every alias with jieba's in-memory dictionary before segmenting.
# The alias -> canonical-name mapping is applied per token, so it only works
# when the segmenter emits an alias as ONE token; an alias that the default
# dictionary does not know may otherwise be split into smaller pieces and
# never match replace_dict.
for known_alias in replace_dict:
    jieba.add_word(known_alias)

# Segment the whole text into a list of tokens.
words = jieba.lcut(text)

# Tokens to discard outright.
# NOTE(review): every entry here is a single character, and the filter below
# already drops tokens shorter than two characters, so this list currently
# never excludes anything on its own.
stop_words = ['的', '是', '了', '和', '我', '你', '他', '这', '那', '着', '也', '都', '要', '就', '不', '人', '有', '说', '道', '笑']

# Keep tokens of at least two characters that are neither stop words nor pure
# whitespace, replacing each character alias with its canonical name.
filtered_words = [
    replace_dict.get(token, token)
    for token in words
    if len(token) >= 2 and not (token in stop_words or token.isspace())
]

# Count how many times each surviving token occurs.
word_counts = Counter(filtered_words)

# The 20 most frequent tokens as (token, occurrences) pairs, most common first.
top20 = word_counts.most_common(20)

# Print a header followed by one "token,count" line per entry.
print("红楼梦词频统计(前20):")
line_template = "{},{}次"
for entry in top20:
    print(line_template.format(*entry))
posted @ 2025-06-21 17:31  与尔5  阅读(7)  评论(0)    收藏  举报