import jieba
import re
from collections import Counter
# Read the text of 《聊斋志异》 (Strange Tales from a Chinese Studio)
def read_liaozhai(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        print(f"Error: file not found: {file_path}")
        return ""
# Custom segmentation dictionary and stopword list
def load_custom_resources():
    # Add custom entries so jieba keeps these names as single tokens
    jieba.add_word('孙猴子')
    jieba.add_word('孙悟空')
    jieba.add_word('婴宁')
    jieba.add_word('王子服')
    # More Liaozhai-specific vocabulary can be added here
    # Stopword list (common Chinese function words)
    stopwords = {'的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好', '自己', '这'}
    return stopwords
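# Alternative (a sketch, assuming a UTF-8 file 'liaozhai_dict.txt' that you
# maintain yourself, one entry per line in jieba's "word [freq] [POS]" format):
# instead of calling jieba.add_word() repeatedly, the whole custom dictionary
# can be loaded in one call:
#
#     jieba.load_userdict('liaozhai_dict.txt')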
# Merge synonyms so different names for the same character are counted together
def merge_synonyms(word_counts):
    # Map each alias to a canonical form
    synonyms = {
        '孙猴子': '孙悟空',
        '齐天大圣': '孙悟空',
        # More alias pairs can be added, e.g.:
        # '绛雪': '香玉',
    }
    merged_counts = Counter()
    for word, count in word_counts.items():
        if word in synonyms:
            merged_counts[synonyms[word]] += count
        else:
            merged_counts[word] += count
    return merged_counts
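# Example (hypothetical counts, for illustration only):
#   merge_synonyms(Counter({'孙猴子': 3, '齐天大圣': 2, '孙悟空': 5}))
#   returns Counter({'孙悟空': 10})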
# Main entry point
def main():
    # Replace with the actual path to your 《聊斋志异》 text file
    file_path = 'liaozhai.txt'
    # Read the text
    text = read_liaozhai(file_path)
    if not text:
        return
    # Load custom resources
    stopwords = load_custom_resources()
    # Segment the text
    words = jieba.cut(text)
    # Filter out stopwords and meaningless tokens
    filtered_words = []
    for word in words:
        # Drop single characters and stopwords
        if len(word) > 1 and word not in stopwords:
            # Keep only tokens made up entirely of Chinese characters
            if re.match(r'^[\u4e00-\u9fa5]+$', word):
                filtered_words.append(word)
    # Count word frequencies
    word_counts = Counter(filtered_words)
    # Merge synonyms
    merged_counts = merge_synonyms(word_counts)
    # Take the 20 most frequent tokens
    top_20 = merged_counts.most_common(20)
    # Print the result
    print("Top 20 most frequent tokens in 《聊斋志异》:")
    for rank, (word, count) in enumerate(top_20, 1):
        print(f"{rank}. {word}: {count} times")

if __name__ == "__main__":
    main()
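As an optional refinement (a sketch, not part of the pipeline above): jieba's posseg module tags each token with a part of speech, so the count can be restricted to noun-like tokens ('n...' flags, with 'nr' marking person names), which tends to surface character names more reliably than plain frequency. The helper name noun_counts below is hypothetical; it assumes the same stopword set as above.

import jieba.posseg as pseg
from collections import Counter

def noun_counts(text, stopwords):
    # Count only multi-character noun-like tokens; flag 'nr' marks person names
    counts = Counter()
    for word, flag in pseg.cut(text):
        if len(word) > 1 and word not in stopwords and flag.startswith('n'):
            counts[word] += 1
    return counts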