# 聊斋志异 (Liaozhai Zhiyi / "Strange Tales from a Chinese Studio") word-frequency analysis
import jieba
from collections import Counter
import re
# Character alias mapping table (extend as needed)
# Canonical character name -> tuple of known aliases.
_canonical_aliases = {
    '孙悟空': ('孙猴子', '大圣', '猴王'),
    '狐狸精': ('狐仙', '狐妖'),
    '小倩': ('聂小倩',),
    '宁生': ('宁采臣',),
    # Add more alias groups here...
}

# Inverted view: alias -> canonical name, used for lookup during merging.
name_mapping = {
    alias: canonical
    for canonical, aliases in _canonical_aliases.items()
    for alias in aliases
}
def load_and_clean_text(file_path):
    """Read the UTF-8 text file at *file_path* and strip punctuation.

    Every character that is neither a word character (``\\w``, which in
    Python 3 includes CJK characters) nor whitespace is removed.
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        raw = fh.read()
    # Drop punctuation and other special characters in a single pass.
    return re.sub(r'[^\w\s]', '', raw)
def merge_names(word_list):
    """Replace every known alias in *word_list* with its canonical name.

    Tokens absent from ``name_mapping`` pass through unchanged; the
    output list preserves the input order and length.
    """
    merged = []
    for token in word_list:
        merged.append(name_mapping.get(token, token))
    return merged
def analyze_liaozhai(file_path, top_n=20, stop_words=None):
    """Compute the most frequent words in a Liaozhai Zhiyi text file.

    Pipeline: load + clean the text, segment it with jieba, fold character
    aliases into canonical names, drop single characters and stop words,
    then count frequencies.

    Args:
        file_path: Path to the UTF-8 text file to analyze.
        top_n: Number of top entries to return (default 20).
        stop_words: Optional container of words to exclude; when None, a
            built-in set of common function words (including the classical
            '曰'/'道') is used.

    Returns:
        A list of (word, count) tuples, most frequent first.
    """
    if stop_words is None:
        # Default filler words; includes classical-Chinese function words.
        stop_words = {'的', '了', '是', '我', '你', '他', '这', '那', '就',
                      '也', '都', '不', '在', '有', '说', '道', '曰'}
    text = load_and_clean_text(file_path)
    # Segment, then collapse aliases so one character counts as one word.
    words = merge_names(jieba.lcut(text))
    # Keep only multi-character tokens that are not stop words.
    filtered_words = [w for w in words if len(w) > 1 and w not in stop_words]
    return Counter(filtered_words).most_common(top_n)
# Usage example
# BUG FIX: the guard was `if name == "main":` — the dunder underscores were
# lost in extraction; without them the script raises NameError on import.
if __name__ == "__main__":
    # Replace with your own path to the Liaozhai Zhiyi text file.
    file_path = 'liaozhai.txt'
    try:
        top_20_words = analyze_liaozhai(file_path)
        print("聊斋志异高频词统计(前20个):")
        print("{:<10}{:<10}".format("词语", "频次"))
        print("="*20)
        for word, count in top_20_words:
            print("{:<10}{:<10}".format(word, count))
    except FileNotFoundError:
        print(f"错误:找不到文件 {file_path}")
    except Exception as e:
        # Top-level boundary: report any other failure instead of crashing.
        print(f"发生错误:{str(e)}")

# (scraped web-page footer, not part of the program: 浙公网安备 33010602011771号)