# Hongloumeng (红楼梦, Dream of the Red Chamber) word segmentation and frequency analysis
import jieba
from collections import Counter
def analyze_hongloumeng(file_path=r"C:\Users\xxx\Downloads\《红楼梦》【爱上阅读_www.isyd.net】.txt"):
    """Segment the full text of Hongloumeng with jieba, normalize character-name
    aliases to one canonical form, drop stop words, and print the 20 most
    frequent remaining words.

    Args:
        file_path: Path to the UTF-8 text of the novel. Defaults to the
            original hard-coded location so existing callers are unchanged.

    Returns:
        None. Results go to stdout; on a missing file an error message is
        printed and the function returns early.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        print(f"未找到文件 {file_path},请检查文件路径。")
        return
    # Map traditional-script variants and in-novel nicknames onto ONE
    # canonical simplified name per character, so counts aggregate.
    # NOTE: lookup below is single-pass (no chained re-lookup), so every
    # value here must already be the canonical form.
    alias_map = {
        # BUGFIX: "寶玉" previously mapped to the traditional "賈寶玉",
        # which split its count away from the canonical "贾宝玉".
        "寶玉": "贾宝玉", "宝玉": "贾宝玉",
        "賈寶玉": "贾宝玉",
        "寶二爺": "贾宝玉", "宝二爷": "贾宝玉",
        "林妹妹": "林黛玉",
        "黛玉": "林黛玉",
        "林黛玉": "林黛玉",
        "鳳姐": "王熙凤", "凤姐": "王熙凤",
        "鳳姐兒": "王熙凤", "凤姐儿": "王熙凤",
        "王熙鳳": "王熙凤", "王熙凤": "王熙凤",
        "鳳辣子": "王熙凤", "凤辣子": "王熙凤",
        "寶釵": "薛宝钗", "宝钗": "薛宝钗",
        "薛寶釵": "薛宝钗", "薛宝钗": "薛宝钗",
        "襲人": "袭人", "袭人": "袭人",
        "賈母": "贾母", "贾母": "贾母",
        "老太太": "贾母",
        "賈政": "贾政", "贾政": "贾政",
        "王夫人": "王夫人",
        "賈璉": "贾琏", "贾琏": "贾琏",
        "平兒": "平儿", "平儿": "平儿",
        "湘雲": "史湘云", "湘云": "史湘云",
        "史湘雲": "史湘云", "史湘云": "史湘云",
        "劉姥姥": "刘姥姥", "刘姥姥": "刘姥姥",
        "晴雯": "晴雯",
        "紫鵑": "紫鹃", "紫鹃": "紫鹃",
        "鴛鴦": "鸳鸯", "鸳鸯": "鸳鸯",
        "薛蟠": "薛蟠",
        "香菱": "香菱",
        "賈雨村": "贾雨村", "雨村": "贾雨村",
        "甄士隱": "甄士隐", "士隐": "甄士隐"
    }
    # High-frequency function words and narrative boilerplate to exclude.
    # (Set literal: accidental duplicates in the original are harmless.)
    stop_words = {
        '的', '了', '我', '你', '他', '她', '也', '说', '是', '不', '在', '有',
        '就', '这', '那', '一', '个', '人', '里', '见', '道', '便', '都', '来',
        '去', '上', '大', '又', '把', '得', '着', '与', '和', '地', '之', '儿',
        '中', '自', '己', '只', '将', '还', '知道', '如今', '两个', '那里',
        '什么', '没有', '不是', '这样', '只见', '听见', '说道', '出来', '这个',
        '一个', '我们', '你们', '他们', '姑娘', '这里', '奶奶', '太太', '老爷',
        '只得', '一声', '原来', '不敢', '今日', '二人', '一面', '不知',
        '起来', '回来', '进去', '如此', '怎么', '大家', '这些',
        '自己', '心里', '云云', '看官', '弟子', '不知', '因此'
    }
    # jieba.cut returns a lazy generator; count as we stream.
    words = jieba.cut(text)
    word_counts = Counter()
    for word in words:
        word = word.strip()
        # Keep multi-character tokens only; single characters are mostly
        # function words even beyond the explicit stop list.
        if len(word) > 1 and word not in stop_words:
            normalized_word = alias_map.get(word, word)
            word_counts[normalized_word] += 1
    top_20 = word_counts.most_common(20)
    for word, count in top_20:
        print(f"{word}: {count}")
# BUGFIX: original read `if name == "main":`, which raises NameError at
# import time. Use the standard dunder entry-point guard.
if __name__ == "__main__":
    analyze_hongloumeng()
