# 聊斋志异 (Strange Tales from a Chinese Studio): word-frequency analysis
import jieba
import re
from collections import Counter
import chardet
# 1. Read the file safely (handle encoding and path issues automatically)
def read_file_safely(path):
    try:
        # Detect the file encoding from the first 10 KB
        with open(path, 'rb') as f:
            raw_data = f.read(10000)
        result = chardet.detect(raw_data)
        encoding = result['encoding'] or 'gb18030'
        # Read the content; strict error handling so the fallback below can actually trigger
        with open(path, 'r', encoding=encoding) as f:
            return f.read()
    except (UnicodeDecodeError, LookupError):
        # Fallback: try the encodings most common for Chinese classics first
        for enc in ('gb18030', 'gbk', 'utf-8'):
            try:
                with open(path, 'r', encoding=enc) as f:
                    return f.read()
            except UnicodeDecodeError:
                continue
        raise ValueError("Failed to decode the file; please check that it is intact")
file_path = r"C:\Users\minmin\PycharmProjects\PythonProject\test\聊斋志异.txt"
text = read_file_safely(file_path)
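# Optional sanity check (not in the original pipeline): chardet.detect() also
# returns a confidence score, which helps spot mis-detections on short samples.
with open(file_path, 'rb') as f:
    detection = chardet.detect(f.read(10000))
print(f"detected encoding: {detection['encoding']} (confidence {detection['confidence']:.2f})")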
# 2. Preprocess the text: keep only Chinese characters and full-width punctuation
text = re.sub(r"[^\u4e00-\u9fa5。,!?;:、\n]", "", text)
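# Quick illustration of the filter on a hypothetical input: digits, Latin letters,
# spaces and half-width punctuation are stripped; CJK characters and the listed
# full-width punctuation survive:
#   re.sub(r"[^\u4e00-\u9fa5。,!?;:、\n]", "", "第1回 聂小倩,abc")  ->  "第回聂小倩,"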
# 3. Define a synonym dictionary (merge aliases of the same entity)
synonym_dict = {
    "小倩": "聂小倩", "鬼妻": "聂小倩", "采臣": "宁采臣",  # Nie Xiaoqian / Ning Caichen
    "黑山": "黑山老妖", "万妖群魔之首": "黑山老妖",  # aliases of the Black Mountain Demon
    "十四娘": "辛十四娘", "子楚": "孙子楚",  # other characters
    "赵阿宝": "阿宝", "孙猴子": "孙悟空", "孙行者": "孙悟空"  # common aliases
}
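# Optional: register the canonical names and their aliases with jieba's user
# dictionary so multi-character names segment as single tokens (jieba.add_word
# is jieba's documented API; this loop is a sketch, not part of the original script):
for name in set(synonym_dict) | set(synonym_dict.values()):
    jieba.add_word(name)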
# 4. Load the stop-word list
stop_words = set()
# Basic stop words (extend as needed)
base_stops = {"不知", "不可", "不敢", "以为", "可以", "如此", "而已", "众人", "说道", "女子"}
stop_words.update(base_stops)
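# Optional: merge in an external stop-word file, one word per line (the file
# name "stopwords.txt" is a placeholder, not from the original project):
try:
    with open("stopwords.txt", encoding="utf-8") as f:
        stop_words.update(line.strip() for line in f if line.strip())
except FileNotFoundError:
    pass  # fall back to the basic list above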
# 5. Segmentation + synonym merging + stop-word filtering
def process_text(text):
    words = jieba.lcut(text, cut_all=False)  # precise-mode segmentation
    processed = []
    for word in words:
        # Merge synonyms first, so aliases are counted under their canonical
        # name and the stop-word check applies to the canonical form
        word = synonym_dict.get(word, word)
        if len(word) == 1 or word in stop_words:
            continue  # drop single characters and stop words
        processed.append(word)
    return processed
words_processed = process_text(text)
# 6. Count word frequencies and print the Top 20
word_counts = Counter(words_processed)
top20 = word_counts.most_common(20)
print("《聊斋志异》高频词Top20(合并同类词后)😊
for word, count in top20:
    print(f"{word}: {count}")