# 聊斋志异 — word-frequency analysis of "Strange Tales from a Chinese Studio"

import jieba
from collections import Counter
import re

# Character-alias mapping table (extend as needed)

# Alias -> canonical name; lookups use .get(word, word), so unknown
# words pass through unchanged.
name_mapping = {
    '孙猴子': '孙悟空',
    '大圣': '孙悟空',
    '猴王': '孙悟空',
    '狐仙': '狐狸精',
    '狐妖': '狐狸精',
    '聂小倩': '小倩',
    '宁采臣': '宁生',
    # Add more alias mappings here...
}

def load_and_clean_text(file_path):
    """Load a UTF-8 text file and strip punctuation / special characters.

    Args:
        file_path: Path to the text file.

    Returns:
        The file contents with every character that is neither a word
        character nor whitespace removed. Under Unicode matching, ``\\w``
        matches CJK ideographs, so the prose itself is preserved while
        punctuation (both ASCII and full-width) is dropped.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Remove punctuation and special characters.
    return re.sub(r'[^\w\s]', '', text)

def merge_names(word_list):
    """Collapse character aliases onto canonical names.

    Each word present in the module-level ``name_mapping`` table is
    replaced by its canonical form; all other words pass through
    unchanged.

    Args:
        word_list: Iterable of segmented words.

    Returns:
        A new list with aliases normalized; input is not mutated.
    """
    return [name_mapping.get(word, word) for word in word_list]

def analyze_liaozhai(file_path, top_n=20):
    """Compute the top-N most frequent words of a 聊斋志异 text file.

    Args:
        file_path: Path to the UTF-8 source text.
        top_n: Number of top entries to return (default 20).

    Returns:
        A list of ``(word, count)`` tuples in descending frequency order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Load and clean the text.
    text = load_and_clean_text(file_path)

    # Segment with jieba (Chinese word segmentation).
    words = jieba.lcut(text)

    # Normalize character aliases onto canonical names.
    words = merge_names(words)

    # Drop stop words and single characters; classical-Chinese particles
    # such as 曰/道 carry little topical information.
    stop_words = {'的', '了', '是', '我', '你', '他', '这', '那', '就', '也',
                  '都', '不', '在', '有', '说', '道', '曰'}
    filtered_words = [w for w in words if len(w) > 1 and w not in stop_words]

    # Count frequencies and keep the top_n entries.
    return Counter(filtered_words).most_common(top_n)

# Usage example

# NOTE: the original read `if name == "main":` — markdown rendering ate the
# double underscores; restored to the standard script-entry guard.
if __name__ == "__main__":
    # Replace with the path to your 聊斋志异 text file.
    file_path = 'liaozhai.txt'

    try:
        top_20_words = analyze_liaozhai(file_path)

        print("聊斋志异高频词统计(前20个):")
        print("{:<10}{:<10}".format("词语", "频次"))
        print("=" * 20)
        for word, count in top_20_words:
            print("{:<10}{:<10}".format(word, count))
    except FileNotFoundError:
        print(f"错误:找不到文件 {file_path}")
    except Exception as e:
        # Top-level boundary: report anything unexpected instead of crashing.
        print(f"发生错误:{str(e)}")
# posted @ 2025-06-22 13:14  cchb  阅读(15)  评论(0)  收藏  举报