# Jieba word segmentation (Jieba分词)

import jieba
from collections import Counter
import re

# Count the 20 most frequent multi-character words in "Journey to the West".
#
# Read the novel's text from '西游记.txt' (path is resolved relative to the
# current working directory).
try:
    with open('西游记.txt', 'r', encoding='utf-8') as f:
        content = f.read()
except FileNotFoundError:
    print("未找到西游记文本文件,请确保文件名为'西游记.txt'并与代码在同一目录下")
    # Raise SystemExit directly: the bare exit() helper is injected by the
    # site module and is not guaranteed to exist, and a non-zero status lets
    # callers see that the run failed.
    raise SystemExit(1)

# Text preprocessing: collapse each run of non-Chinese characters
# (punctuation, digits, Latin letters, whitespace, newlines) into one space.
# Deleting them outright would glue neighbouring sentences together and let
# the segmenter form words that straddle a punctuation mark.
content = re.sub(r'[^\u4e00-\u9fa5]+', ' ', content)

# Segment the cleaned text with jieba.
words = jieba.lcut(content)

# Drop single characters and whitespace-only tokens (including the separator
# spaces introduced by the preprocessing step).
filtered_words = [word for word in words if len(word) >= 2 and word.strip()]

# Count word frequencies.
word_counts = Counter(filtered_words)

# Take the 20 most frequent words.
top_20_words = word_counts.most_common(20)

# Print the result.
print("《西游记》中出现次数最高的20个词:")
for word, count in top_20_words:
    print(f"{word}: {count}次")

# posted @ 2025-06-23 13:40  kkkk0515  阅读(14)  评论(0)    收藏  举报