# jieba分词,红楼梦
import jieba
from collections import Counter
import re
# 加载红楼梦的文本
def load_text(file_path):
    """Read an entire UTF-8 text file and return its contents.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The complete file contents as a single ``str``.

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the file even if reading fails.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text
# 使用jieba进行分词,并筛选人名
def segment_and_count_names(text):
    """Segment *text* with jieba and keep the tokens that look like names.

    Despite its name, this function does not count anything: it returns the
    filtered tokens in order, with duplicates, so a caller can count them.

    The filter is purely lexical: any token that starts with two to four
    Chinese characters and then ends is kept. It therefore also lets
    through ordinary words of that length, not only personal names.

    Args:
        text: The text to segment.

    Returns:
        A list of the matching tokens, in the order jieba produced them.
    """
    # Register common Dream of the Red Chamber names so jieba keeps them as
    # single tokens. The dictionary file is assumed to exist in the current
    # working directory; presumably jieba raises if it is missing.
    jieba.load_userdict('custom_dict.txt')
    words = jieba.lcut(text)
    # Chinese names are usually 2-4 characters long. The character class is
    # the CJK range \u4e00-\u9fa5; the previous "[1]" was a garbled footnote
    # marker that only matched the digit "1".
    name_pattern = re.compile(r'[\u4e00-\u9fa5]{2,4}$')
    names = [word for word in words if name_pattern.match(word)]
    return names
# 统计人名出现频率
def count_name_frequency(names):
    """Count how many times each name occurs.

    Args:
        names: An iterable of hashable items (here, name strings).

    Returns:
        A ``collections.Counter`` mapping each distinct item to its number
        of occurrences.
    """
    return Counter(names)
# 主函数
def main():
    """Print the 20 most frequent name-like tokens found in the novel.

    Reads ``hongloumeng.txt`` from the current working directory, segments
    it, counts the filtered tokens and prints one ``name: count`` line per
    entry, most frequent first.
    """
    file_path = 'hongloumeng.txt'  # path of the Dream of the Red Chamber text
    text = load_text(file_path)
    names = segment_and_count_names(text)
    name_counter = count_name_frequency(names)
    # Report only the 20 most frequent entries.
    for name, count in name_counter.most_common(20):
        print(f"{name}: {count}")
# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
# Footnote: the character class used in name_pattern is [\u4e00-\u9fa5] (the range of common Chinese characters).

# 浙公网安备 33010602011771号