# 红楼梦分词 — word segmentation and frequency count for "Dream of the Red Chamber"

import jieba
from collections import Counter

def analyze_hongloumeng(
    file_path: str = r"C:\Users\xxx\Downloads\《红楼梦》【爱上阅读_www.isyd.net】.txt",
    top_n: int = 20,
) -> None:
    """Print the most frequent multi-character words of a text file.

    The text is read as UTF-8, segmented with ``jieba``, filtered through a
    stop-word list, and character aliases (traditional/simplified spellings,
    nicknames) are folded into one canonical simplified name before counting.
    Each result is printed as ``word: count``.

    Args:
        file_path: Path of the text file to analyse. Defaults to the
            location that was previously hard-coded in this function.
        top_n: How many of the most frequent words to print.

    Returns:
        None. If the file does not exist, a message is printed and the
        function returns without doing any analysis.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        print(f"未找到文件 {file_path},请检查文件路径。")
        return

    # Alias -> canonical (simplified) name. The lookup below is a single
    # step, so every value must already be the final canonical spelling.
    # Identity entries are kept on purpose: the keys are also registered
    # with jieba further down.
    alias_map = {
        "寶玉": "贾宝玉", "宝玉": "贾宝玉",
        "賈寶玉": "贾宝玉",
        "寶二爺": "贾宝玉", "宝二爷": "贾宝玉",
        "林妹妹": "林黛玉",
        "黛玉": "林黛玉",
        "林黛玉": "林黛玉",
        "鳳姐": "王熙凤", "凤姐": "王熙凤",
        "鳳姐兒": "王熙凤", "凤姐儿": "王熙凤",
        "王熙鳳": "王熙凤", "王熙凤": "王熙凤",
        "鳳辣子": "王熙凤", "凤辣子": "王熙凤",
        "寶釵": "薛宝钗", "宝钗": "薛宝钗",
        "薛寶釵": "薛宝钗", "薛宝钗": "薛宝钗",
        "襲人": "袭人", "袭人": "袭人",
        "賈母": "贾母", "贾母": "贾母",
        "老太太": "贾母",
        "賈政": "贾政", "贾政": "贾政",
        "王夫人": "王夫人",
        "賈璉": "贾琏", "贾琏": "贾琏",
        "平兒": "平儿", "平儿": "平儿",
        "湘雲": "史湘云", "湘云": "史湘云",
        "史湘雲": "史湘云", "史湘云": "史湘云",
        "劉姥姥": "刘姥姥", "刘姥姥": "刘姥姥",
        "晴雯": "晴雯",
        "紫鵑": "紫鹃", "紫鹃": "紫鹃",
        "鴛鴦": "鸳鸯", "鸳鸯": "鸳鸯",
        "薛蟠": "薛蟠",
        "香菱": "香菱",
        "賈雨村": "贾雨村", "雨村": "贾雨村",
        "甄士隱": "甄士隐", "士隱": "甄士隐", "士隐": "甄士隐",
    }

    # Only multi-character entries are listed: single-character tokens are
    # already discarded by the length check in the counting loop.
    stop_words = {
        '知道', '如今', '两个', '那里', '什么', '没有', '不是', '这样',
        '只见', '听见', '说道', '出来', '这个', '一个', '我们', '你们',
        '他们', '姑娘', '这里', '奶奶', '太太', '老爷', '只得', '一声',
        '原来', '不敢', '今日', '二人', '一面', '不知', '起来', '回来',
        '进去', '如此', '怎么', '大家', '这些', '自己', '心里', '云云',
        '看官', '弟子', '因此',
    }

    # Register every alias and canonical name with jieba so that nicknames
    # are emitted as single tokens and can actually reach the alias lookup.
    for name in set(alias_map) | set(alias_map.values()):
        jieba.add_word(name)

    word_counts = Counter()
    for token in jieba.cut(text):
        token = token.strip()
        if len(token) <= 1 or token in stop_words:
            continue
        word_counts[alias_map.get(token, token)] += 1

    for word, count in word_counts.most_common(top_n):
        print(f"{word}: {count}")

# Run the analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    analyze_hongloumeng()

# posted @ 2025-06-22 18:13  渔樵伴夜归客  阅读(11)  评论(0)    收藏  举报