# Jieba word segmentation (Jieba分词): word-frequency count for 《西游记》
import jieba
from collections import Counter
import re
# Matches one maximal run of Chinese characters (same range the script has
# always used). Punctuation, digits, Latin letters and whitespace separate runs.
_CHINESE_RUN = re.compile(r'[\u4e00-\u9fa5]+')


def read_text(path='西游记.txt'):
    """Return the contents of the UTF-8 text file at *path*.

    Prints a hint and exits with status 1 when the file does not exist.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        print("未找到西游记文本文件,请确保文件名为'西游记.txt'并与代码在同一目录下")
        # SystemExit instead of the site-provided exit(): always available,
        # and a non-zero status tells the caller the run failed.
        raise SystemExit(1) from None


def extract_chinese_runs(text):
    """Return the maximal runs of Chinese characters in *text*, in order.

    Everything else (punctuation, newlines, digits, Latin letters) is dropped
    and acts as a separator between runs.
    """
    return _CHINESE_RUN.findall(text)


def segment(text):
    """Segment *text* into words with jieba, one Chinese run at a time.

    Segmenting each run separately keeps sentence boundaries intact; deleting
    the punctuation first would glue neighbouring sentences together and let
    the segmenter form words that straddle the boundary.
    """
    words = []
    for run in extract_chinese_runs(text):
        words.extend(jieba.lcut(run))
    return words


def top_words(words, n=20):
    """Return the *n* most frequent words as ``(word, count)`` pairs.

    Single-character and whitespace-only tokens are ignored.
    """
    filtered_words = [word for word in words if len(word) >= 2 and word.strip()]
    return Counter(filtered_words).most_common(n)


def main():
    """Read '西游记.txt', count word frequencies and print the top 20 words."""
    content = read_text()
    top_20_words = top_words(segment(content), 20)

    print("《西游记》中出现次数最高的20个词:")
    for word, count in top_20_words:
        print(f"{word}: {count}次")


if __name__ == "__main__":
    main()
# 浙公网安备 33010602011771号  (web-page footer captured with the snippet; not part of the script)