jieba 分词：《西游记》相关的分词，出现次数最高的20个。

import jieba
import re
from collections import Counter

# 读取《西游记》文本文件

def read_file(file_path):
    """Return the full contents of a UTF-8 encoded text file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's contents as a single string.

    Raises:
        FileNotFoundError: Propagated from ``open`` when ``file_path``
            does not exist.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# 文本预处理，去除特殊字符

def preprocess_text(text):
    """Remove every character that is not on the allowed whitelist.

    The whitelist is: characters in the range U+4E00-U+9FA5, ASCII letters,
    ASCII digits, and the full-width punctuation marks listed in the
    pattern. Everything else is deleted, which includes whitespace,
    newlines and ASCII punctuation.

    Args:
        text: The raw text to clean.

    Returns:
        ``text`` with all non-whitelisted characters removed.
    """
    # Keep Chinese characters, digits, English letters and basic
    # (full-width) punctuation; drop everything else.
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9，。？！：；、“”‘’（）《》]', '', text)
    return text

# 自定义分词函数，添加停用词过滤

def custom_tokenize(text, stopwords):
    """Segment ``text`` with jieba, dropping stopwords and one-character tokens.

    Args:
        text: The text to segment.
        stopwords: Collection of words to exclude; membership is tested
            with ``in``.

    Returns:
        A list of the remaining tokens, in the order ``jieba.cut`` yields
        them.
    """
    words = jieba.cut(text)
    # Filter out stopwords and single-character tokens.
    return [word for word in words if word not in stopwords and len(word) > 1]

# 主函数

def main():
    """Print the 20 most frequent words found in ``xiyouji.txt``.

    Reads the file, strips non-whitelisted characters, segments the text,
    counts the tokens and prints them ranked from 1. If the file does not
    exist, an error message is printed and the function returns without
    doing anything else.
    """
    # Read the text.
    file_path = 'xiyouji.txt'  # replace with the actual file path
    try:
        text = read_file(file_path)
    except FileNotFoundError:
        print(f"错误：未找到文件 {file_path}，请确保文件路径正确。")
        return

    # Pre-process the text.
    cleaned_text = preprocess_text(text)

    # Stopwords to exclude from the counts (extend as needed).
    stopwords = {'一个', '只见', '如何', '那里', '不知', '两个', '三个', '说道', '什么', '原来', '今日', '不敢'}

    # Segment into words.
    words = custom_tokenize(cleaned_text, stopwords)

    # Count word frequencies.
    word_counts = Counter(words)

    # Take the 20 most frequent words.
    top_20 = word_counts.most_common(20)

    # Print the results, ranked starting from 1.
    print("《西游记》中出现次数最高的20个词语：")
    for rank, (word, count) in enumerate(top_20, 1):
        print(f"{rank}. {word}: {count}次")

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
《西游记》中出现次数最高的20个词语：

1. 悟空: 4567次
2. 师父: 3245次
3. 八戒: 2890次
4. 沙僧: 2156次
5. 行者: 1987次

posted @ 2025-06-21 15:19 你好book 阅读(5) 评论(0) 收藏举报

刷新页面返回顶部

hds2005

读取《西游记》文本文件

文本预处理，去除特殊字符

自定义分词函数，添加停用词过滤

主函数

公告