# jieba分词，红楼梦

import jieba
from collections import Counter
import re

# 加载红楼梦的文本

def load_text(file_path):
    """Read a UTF-8 text file and return its whole contents as one string.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# 使用jieba进行分词，并筛选人名

def segment_and_count_names(text):
    """Segment *text* with jieba and keep the tokens that look like names.

    Despite its name, this function does not count anything: it returns
    the matching tokens in order, duplicates included, so the caller can
    tally them.

    The filter is a heuristic. It keeps every token made up solely of
    2-4 characters in the range U+4E00..U+9FA5, so ordinary words of
    that length pass as well as personal names.

    Args:
        text: The text to segment.

    Returns:
        A list of the tokens that pass the 2-4 character filter.
    """
    # Register extra words (intended for the novel's character names) so
    # jieba keeps them as single tokens. The dictionary file is assumed
    # to exist at this relative path.
    jieba.load_userdict('custom_dict.txt')
    words = jieba.lcut(text)
    # Chinese names are usually 2-4 characters long. The character class
    # had been garbled into the literal "[1]", which only matched runs of
    # the digit 1 and so discarded every Chinese token.
    name_pattern = re.compile(r'^[\u4e00-\u9fa5]{2,4}$')
    return [word for word in words if name_pattern.match(word)]

# 统计人名出现频率

def count_name_frequency(names):
    """Count how many times each name occurs.

    Args:
        names: An iterable of hashable items (here, name strings).

    Returns:
        A ``collections.Counter`` mapping each distinct item to its
        number of occurrences.
    """
    return Counter(names)

# 主函数

def main():
    """Print the 20 most frequent name-like tokens found in the novel.

    Reads ``hongloumeng.txt``, segments it, tallies the tokens that pass
    the name filter, and prints one ``name: count`` line per entry in
    descending order of frequency.
    """
    file_path = 'hongloumeng.txt'  # path of the novel's text file
    text = load_text(file_path)
    names = segment_and_count_names(text)
    name_counter = count_name_frequency(names)
    # Report the 20 most frequent names.
    for name, count in name_counter.most_common(20):
        print(f"{name}: {count}")

# Run main() only when the file is executed as a script, not when it is
# imported. The double underscores had been stripped from __name__ and
# '__main__', which made this line raise NameError.
if __name__ == '__main__':
    main()

注：`[\u4e00-\u9fa5]` 是上文人名正则表达式中用于匹配汉字的字符类，完整的表达式为 `^[\u4e00-\u9fa5]{2,4}$`；该字符类在页面渲染时被误当作脚注。 ↩︎

posted @ 2025-06-22 14:56 magixx 阅读(12) 评论(0) 收藏举报

刷新页面返回顶部

Melody0714

jieba分词，红楼梦

加载红楼梦的文本

使用jieba进行分词，并筛选人名

统计人名出现频率

主函数

公告