# Python 词云:聊斋
import jieba.posseg as pseg
import collections
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Character alias table: maps every alias (and each canonical name itself) to
# one canonical name, so that different spellings of the same character are
# counted together. Uses Liaozhai characters as the example, plus a few
# extra entries kept as test examples.
ALIAS_MAP = {
    # Common character aliases in Liaozhai Zhiyi
    "婴宁": "婴宁",
    "小倩": "聂小倩",
    "聂小倩": "聂小倩",
    "宁采臣": "宁采臣",
    "宁生": "宁采臣",  # often called 宁生 in the text
    "燕赤霞": "燕赤霞",
    "燕生": "燕赤霞",  # 燕赤霞 is often called 燕生
    "娇娜": "娇娜",
    "孔生": "孔雪笠",  # 孔雪笠 is often called 孔生
    "孔雪笠": "孔雪笠",
    "青凤": "青凤",
    "耿去病": "耿去病",
    "耿生": "耿去病",
    "侠女": "侠女",
    "红玉": "红玉",
    "冯生": "冯相如",  # 冯相如 is often called 冯生
    "冯相如": "冯相如",
    "连城": "连城",
    "乔生": "乔大年",  # 乔大年 is often called 乔生
    "乔大年": "乔大年",
    "辛十四娘": "辛十四娘",
    "十四娘": "辛十四娘",
    "孙子楚": "孙子楚",
    "阿宝": "阿宝",
    "瑞云": "瑞云",
    "贺生": "贺生",
    "细柳": "细柳",
    "黄英": "黄英",
    "马子才": "马子才",
    "马生": "马子才",
    "白秋练": "白秋练",
    "慕生": "慕蟾宫",
    "慕蟾宫": "慕蟾宫",
    "晚霞": "晚霞",
    "阿端": "阿端",
    "王桂庵": "王桂庵",
    # Several different 王生 may exist depending on the chapter; this entry
    # is only an example.
    "王生": "王桂庵",
    "芸娘": "芸娘",
    "席方平": "席方平",
    # NOTE(review): 促织 looks like a story title rather than a personal
    # alias -- confirm this mapping is intended.
    "促织": "成名",
    "成名": "成名",
    "胭脂": "胭脂",
    "鄂生": "鄂秋隼",
    "鄂秋隼": "鄂秋隼",
    # Extra examples (used for testing)
    "孙悟空": "孙悟空",
    "孙猴子": "孙悟空",
    "孙行者": "孙悟空",
    "行者": "孙悟空",
}
# Suggested stop words (extend or trim to suit the actual text). Words in
# this set are never counted as person names. Each word is listed once.
CUSTOM_STOP_WORDS = {
    "一个", "不知", "不能", "如此", "如何", "什么", "起来",
    "只是", "可以", "这个", "那个", "出来", "那里", "这里",
    "知道", "自己", "看见", "只见", "不是", "没有", "他们",
    "我们", "怎么", "倘若", "忽然",
    "于是", "然后", "虽然", "但是", "因为", "所以", "因此",
    "之后", "以后", "之前", "已经", "正在", "将要", "可能",
    "应该", "必须", "一定", "所有", "一些", "许多",
    "这种", "那样", "这样", "那些", "这些", "一切",
    "任何", "每个", "各自", "另外", "其他", "其它",
}
def load_and_segment(file_path):
    """Read a UTF-8 text file and segment it with part-of-speech tagging.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        The result of ``pseg.cut`` on the file's full text. Callers in this
        file iterate it as ``(word, flag)`` pairs.
        NOTE(review): jieba documents this as a generator, so it can
        presumably be consumed only once -- confirm before iterating twice.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # Tokenize and tag in one pass; person names are filtered out later
    # by their part-of-speech flag.
    return pseg.cut(text)
def merge_aliases(words_with_flag, alias_map=None):
    """Replace every known alias with its canonical name.

    Args:
        words_with_flag: Iterable of ``(word, flag)`` pairs.
        alias_map: Optional mapping from alias to canonical name. When
            omitted (``None``), the module-level ``ALIAS_MAP`` is used.
            Passing a mapping allows a different alias table per text.

    Returns:
        A list of ``(word, flag)`` tuples in input order. Words found in
        the alias table are replaced by their mapped name; all other words
        and every flag are left unchanged.
    """
    if alias_map is None:
        alias_map = ALIAS_MAP
    # dict.get falls back to the word itself when it has no alias entry.
    return [(alias_map.get(word, word), flag) for word, flag in words_with_flag]
def get_top_n_people(words_with_flag, top_n=20, stop_words=None):
    """Count person names and return the ``top_n`` most frequent ones.

    Args:
        words_with_flag: Iterable of ``(word, flag)`` pairs.
        top_n: Maximum number of names to return.
        stop_words: Optional collection of words to exclude. When omitted
            (``None``), the module-level ``CUSTOM_STOP_WORDS`` is used.

    Returns:
        A list of ``(name, count)`` tuples ordered from most to least
        frequent, as produced by ``collections.Counter.most_common``.
    """
    if stop_words is None:
        stop_words = CUSTOM_STOP_WORDS
    # Keep only tokens tagged 'nr' (person name) that are longer than one
    # character and are not stop words.
    people = (
        word
        for word, flag in words_with_flag
        if flag == 'nr' and len(word) > 1 and word not in stop_words
    )
    # Count frequencies and take the most common entries.
    return collections.Counter(people).most_common(top_n)
def generate_wordcloud(word_freq_dict, output_path='liaozhai_wordcloud.png',
                       font_path='simhei.ttf'):
    """Render a word cloud from word frequencies, save it and display it.

    Args:
        word_freq_dict: Mapping from word to frequency.
        output_path: File path the rendered figure is saved to.
        font_path: Font handed to ``WordCloud``; a font with Chinese glyphs
            must be specified for Chinese text.

    Raises:
        ValueError: If ``word_freq_dict`` is empty, since a word cloud
            cannot be drawn from zero words.
    """
    # Fail early with a clear message instead of deep inside the renderer.
    if not word_freq_dict:
        raise ValueError(
            "word_freq_dict is empty; at least one word is needed to draw a word cloud"
        )

    wc = WordCloud(
        font_path=font_path,  # Chinese-capable font, must be specified
        width=800,
        height=600,
        background_color='white',
        max_words=100,
    )
    wc.generate_from_frequencies(word_freq_dict)

    plt.figure(figsize=(10, 8))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title('《聊斋志异》高频人物词云', fontsize=20)
    plt.tight_layout()
    # Save before show(): the figure is written to disk first, then displayed.
    plt.savefig(output_path, dpi=150)
    plt.show()
def main():
    """Run the pipeline: segment, merge aliases, rank people, draw the cloud.

    Reads ``liaozhai.txt`` from the current working directory, prints the
    20 most frequent person names, and then renders them as a word cloud.
    """
    # The Liaozhai text is assumed to be saved as liaozhai.txt
    file_path = 'liaozhai.txt'

    print("正在分词...")
    words_with_flag = load_and_segment(file_path)

    print("正在合并别名...")
    merged_words = merge_aliases(words_with_flag)

    print("正在统计高频人物...")
    top_20 = get_top_n_people(merged_words, 20)

    print("\n=== 《聊斋志异》出现次数最多的20个人物 ===")
    for i, (name, count) in enumerate(top_20, 1):
        print(f"{i:2d}. {name}: {count}次")

    # A word cloud cannot be drawn from zero words, so stop here when no
    # person name was recognised.
    if not top_20:
        print("\n未识别到任何人名,跳过词云生成。")
        return

    # Build the word cloud
    print("\n正在生成词云...")
    word_freq = dict(top_20)
    generate_wordcloud(word_freq)
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

# (page-footer residue from the source web page, not part of the script) 浙公网安备 33010602011771号