# 红楼梦的jieba分词

import jieba
from collections import Counter

# 读取《红楼梦》文本文件

def read_file(filename):
    """Return the entire contents of a UTF-8 encoded text file.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded file contents as a single string.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # The context manager closes the file even if decoding fails mid-read.
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read()

# 使用jieba进行分词并统计词频

def word_frequency(text, top_n=20):
    """Segment ``text`` with jieba and return its most frequent words.

    Args:
        text: The text to segment.
        top_n: Number of highest-frequency words to return (default 20).

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, as produced by ``Counter.most_common``.
    """
    # Segment with jieba's accurate (default) mode; lcut returns a list.
    words = jieba.lcut(text)

    # Keep only tokens longer than one character that are not pure
    # whitespace. This is intended to drop single characters and
    # punctuation; adjust the condition as needed.
    filtered_words = [
        word for word in words
        if len(word) > 1 and not word.isspace()
    ]

    # Count occurrences and take the top_n most common entries.
    word_counts = Counter(filtered_words)
    return word_counts.most_common(top_n)

# 主函数

def main():
    """Read the novel text, count word frequencies and print the top words.

    Reads ``hongloumeng.txt``, registers a few character names with jieba,
    and prints the ranked result of ``word_frequency`` to stdout. Errors are
    reported as printed messages rather than propagated.
    """
    # Path of the text file (replace with the actual location).
    filename = 'hongloumeng.txt'

    try:
        # Load the whole text.
        text = read_file(filename)

        # Optionally register proper names from the novel in jieba's
        # dictionary so they can be segmented as whole words.
        jieba.add_word('贾宝玉')
        jieba.add_word('林黛玉')
        jieba.add_word('薛宝钗')
        jieba.add_word('王熙凤')
        jieba.add_word('贾母')

        # Compute the high-frequency words (default top 20).
        top_words = word_frequency(text)

        # Print the ranking, numbered from 1.
        print("《红楼梦》中出现频率最高的20个词:")
        for i, (word, count) in enumerate(top_words, 1):
            print(f"{i}. {word}: {count}次")

    except FileNotFoundError:
        print(f"错误:找不到文件 {filename}")
    except Exception as e:
        # Top-level boundary: report any other failure instead of crashing.
        print(f"发生错误: {str(e)}")

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

# posted @ 2025-06-21 15:36  HeDesongfuqin  阅读(8)  评论(0)    收藏  举报