# Word segmentation of "Dream of the Red Chamber" (红楼梦) with jieba
import jieba
from collections import Counter
# Read the text file of "Dream of the Red Chamber"
def read_file(filename: str) -> str:
    """Return the entire contents of a UTF-8 encoded text file.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the whole file as a single string.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # The context manager closes the handle even if reading/decoding fails.
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read()
# Segment the text with jieba and count word frequencies
def word_frequency(text, top_n=20):
    """Segment ``text`` with jieba and return its most frequent words.

    Args:
        text: The text to segment.
        top_n: How many of the most frequent words to return (default 20).

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, as produced by ``Counter.most_common(top_n)``.
    """
    # jieba.lcut with default arguments segments in accurate mode and
    # returns the tokens as a list.
    words = jieba.lcut(text)
    # Keep only tokens longer than one character that are not pure
    # whitespace; this drops single characters and single punctuation
    # marks (adjust the filter as needed).
    word_counts = Counter(
        word for word in words if len(word) > 1 and not word.isspace()
    )
    # The top_n highest-count entries, most frequent first.
    return word_counts.most_common(top_n)
# Main function
def main():
    """Print the 20 most frequent multi-character words of the novel.

    Reads ``hongloumeng.txt``, registers a handful of character names with
    jieba, computes the word frequencies and prints a ranked list. Failures
    are reported with a printed message instead of being propagated.
    """
    # Path of the novel's text file (replace with the actual path).
    filename = 'hongloumeng.txt'
    # Bound once so the printed heading and the query can never disagree.
    top_n = 20
    try:
        # Load the whole text.
        text = read_file(filename)
        # Add proper nouns from the novel to jieba's dictionary (optional).
        for character_name in ('贾宝玉', '林黛玉', '薛宝钗', '王熙凤', '贾母'):
            jieba.add_word(character_name)
        # Compute the high-frequency words.
        top_words = word_frequency(text, top_n=top_n)
        # Print the ranking, numbered from 1.
        print(f"《红楼梦》中出现频率最高的{top_n}个词:")
        for i, (word, count) in enumerate(top_words, 1):
            print(f"{i}. {word}: {count}次")
    except FileNotFoundError:
        print(f"错误:找不到文件 {filename}")
    except Exception as e:
        # Top-level boundary: report any other failure rather than crash
        # with a traceback.
        print(f"发生错误: {e}")
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()