Word Frequency Analysis of 《聊斋志异》 (Strange Tales from a Chinese Studio)

import jieba                      # Chinese word segmentation (pip install jieba)
import re
from collections import Counter
import chardet                    # encoding detection (pip install chardet)

1. Read the file safely (auto-detect encoding, with a fallback chain)

def read_file_safely(path):
    try:
        # Detect the file encoding from the first 10 KB
        with open(path, 'rb') as f:
            raw_data = f.read(10000)
        result = chardet.detect(raw_data)
        encoding = result['encoding'] or 'gb18030'

        # Read the content. No errors='replace' here: a wrong guess must
        # raise UnicodeDecodeError so the fallback below actually runs.
        with open(path, 'r', encoding=encoding) as f:
            return f.read()
    except UnicodeDecodeError:
        # Encoding fallback: try the encodings common for Chinese classics first
        for enc in ('gb18030', 'gbk', 'utf-8'):
            try:
                with open(path, 'r', encoding=enc) as f:
                    return f.read()
            except UnicodeDecodeError:
                continue
        raise ValueError("Could not decode the file; please check its integrity")

file_path = r"C:\Users\minmin\PycharmProjects\PythonProject\test\聊斋志异.txt"
text = read_file_safely(file_path)
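For reference, chardet.detect returns a dict holding the guessed encoding, a confidence score, and a language hint, which is handy to inspect when the guess looks off. A minimal sketch (sample.txt is a placeholder path):

    with open("sample.txt", "rb") as f:
        probe = chardet.detect(f.read(10000))
    print(probe)  # e.g. {'encoding': 'GB2312', 'confidence': 0.99, 'language': 'Chinese'}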

2. Text preprocessing: keep only Chinese characters and punctuation

# Keep CJK ideographs (\u4e00-\u9fa5) and the listed fullwidth punctuation; drop everything else
text = re.sub(r"[^\u4e00-\u9fa5。,!?;:、\n]", "", text)
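A quick way to verify what the filter keeps: ASCII letters, digits, spaces, and halfwidth punctuation are dropped, while hanzi and the listed fullwidth marks survive. The sample string is illustrative:

    sample = "宁采臣,浙人。abc 123 Hello!性慷爽,廉隅自重。"
    print(re.sub(r"[^\u4e00-\u9fa5。,!?;:、\n]", "", sample))
    # -> 宁采臣,浙人。性慷爽,廉隅自重。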

3. Define a synonym dictionary (merge aliases of the same entity)

synonym_dict = {
    "小倩": "聂小倩", "鬼妻": "聂小倩", "采臣": "宁采臣",     # Nie Xiaoqian and Ning Caichen
    "黑山": "黑山老妖", "万妖群魔之首": "黑山老妖",            # aliases of the Black Mountain demon
    "十四娘": "辛十四娘", "子楚": "孙子楚",                    # other characters
    "赵阿宝": "阿宝", "孙猴子": "孙悟空", "孙行者": "孙悟空"   # generic aliases
}
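One caveat: the synonym mapping only fires if jieba emits these names as single tokens, and its stock dictionary may split rare names (e.g. 聂小倩 into 聂 + 小倩). Registering them with jieba's add_word API before segmenting avoids that; the word list below simply mirrors the dictionary above:

    # Make sure jieba keeps character names whole; otherwise neither
    # the aliases nor the canonical names can ever match.
    for name in set(synonym_dict) | set(synonym_dict.values()):
        jieba.add_word(name)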

4. Load the stop-word list

stop_words = set()

# Base stop words (extend as needed)
base_stops = {"不知", "不可", "不敢", "以为", "可以", "如此", "而已", "众人", "说道", "女子"}
stop_words.update(base_stops)
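The step title mentions loading a stop-word list; if a file such as one of the widely shared Chinese stop-word lists is available (one word per line), it can be merged in as well. stopwords.txt below is a placeholder path:

    try:
        with open("stopwords.txt", encoding="utf-8") as f:
            stop_words.update(line.strip() for line in f if line.strip())
    except FileNotFoundError:
        pass  # no external list; fall back to the inline base set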

5. Segmentation + synonym merging + stop-word filtering

def process_text(text):
    words = jieba.lcut(text, cut_all=False)  # precise (default) mode
    processed = []
    for word in words:
        if len(word) == 1 or word in stop_words:
            continue  # drop single characters and stop words
        # Synonym merge: replace if the word is in the dict, otherwise keep it
        word = synonym_dict.get(word, word)
        processed.append(word)
    return processed

words_processed = process_text(text)

6. Count word frequencies and print the Top 20

word_counts = Counter(words_processed)
top20 = word_counts.most_common(20)
print("Top 20 high-frequency words in 《聊斋志异》 (after merging synonyms):")
for word, count in top20:
    print(f"{word}: {count}")
