# jieba分词,红楼梦

import jieba
from collections import Counter
import re

# 加载红楼梦的文本

def load_text(file_path: str) -> str:
    """Return the entire contents of *file_path* decoded as UTF-8.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file's full text as a single string.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the handle even if read() raises.
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# 使用jieba进行分词,并筛选人名

def segment_and_count_names(text):
    """Segment *text* with jieba and keep the tokens that look like names.

    A token is kept when it consists solely of 2 to 4 characters in the
    range U+4E00..U+9FA5. This is a length/charset filter only, so any
    2-4 character Chinese word passes, not just personal names.

    Args:
        text: The text to segment.

    Returns:
        A list of the matching tokens, in the order jieba produced them
        (duplicates are kept so they can be counted by the caller).
    """
    # Load the custom dictionary (expected to hold common character names)
    # to improve segmentation accuracy. The file is assumed to exist.
    jieba.load_userdict('custom_dict.txt')
    words = jieba.lcut(text)
    # Match tokens made of 2-4 Chinese characters. The character class was
    # lost in the original source (it read `[1]{2,4}$`, which only matches
    # runs of the digit "1"); it is restored here.
    name_pattern = re.compile(r'^[\u4e00-\u9fa5]{2,4}$')
    names = [word for word in words if name_pattern.match(word)]
    return names

# 统计人名出现频率

def count_name_frequency(names):
    """Count how many times each entry occurs in *names*.

    Args:
        names: An iterable of hashable items (here, name strings).

    Returns:
        A ``collections.Counter`` mapping each distinct item to its
        number of occurrences.
    """
    return Counter(names)

# 主函数

def main():
    """Print the 20 most frequent name-like tokens in 'hongloumeng.txt'.

    Reads the text file, segments it, counts the filtered tokens and
    prints one ``name: count`` line per entry, most frequent first.
    """
    file_path = 'hongloumeng.txt'  # path of the novel's text file
    text = load_text(file_path)
    names = segment_and_count_names(text)
    name_counter = count_name_frequency(names)
    # Output the 20 most frequently occurring names.
    for name, count in name_counter.most_common(20):
        print(f"{name}: {count}")

# Run main() only when this file is executed as a script, not when imported.
# The original `if name == 'main'` had lost the double underscores and would
# raise NameError on the undefined `name`.
if __name__ == '__main__':
    main()


  1. \u4e00-\u9fa5 ↩︎

posted @ 2025-06-22 14:56  magixx  阅读(12)  评论(0)    收藏  举报