"""Count the 20 most frequent words in the novel Dream of the Red Chamber.

Downloads the text (falling back to a local ``hongloumeng.txt``), segments it
with jieba, filters out single-character and whitespace tokens, then prints
and saves the top-20 word frequencies.
"""
import sys
import urllib.request
from collections import Counter

import jieba

# Download the text of Dream of the Red Chamber (use a local file if it fails).
url = "https://raw.githubusercontent.com/noname2048/hongloumeng/master/hongloumeng.txt"
try:
    # `with` ensures the HTTP response is closed (the original leaked it).
    with urllib.request.urlopen(url) as response:
        content = response.read().decode('utf-8')
except (OSError, UnicodeDecodeError):
    # URLError/HTTPError subclass OSError; a bare `except:` here would also
    # swallow KeyboardInterrupt and hide real bugs.
    # Network download failed — try reading from the local file instead.
    try:
        with open('hongloumeng.txt', 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print("请确保有hongloumeng.txt文件在相同目录下,或检查网络连接")
        sys.exit(1)  # explicit nonzero status; plain exit() is a site.py helper

# Segment the text with jieba and keep only tokens longer than one character
# that are not pure whitespace (single characters are mostly particles and
# punctuation in Chinese; adjust the filter as needed).
words = jieba.lcut(content)
filtered_words = [word for word in words if len(word) > 1 and not word.isspace()]

# Count word frequencies and take the 20 most common words.
word_counts = Counter(filtered_words)
top_20_words = word_counts.most_common(20)

# Print the results.
print("《红楼梦》中出现频率最高的20个词:")
for i, (word, count) in enumerate(top_20_words, 1):
    print(f"{i}. {word}: {count}次")

# Optionally save the results to a file.
with open('hongloumeng_word_freq.txt', 'w', encoding='utf-8') as f:
    f.write("《红楼梦》中出现频率最高的20个词:\n")
    for i, (word, count) in enumerate(top_20_words, 1):
        f.write(f"{i}. {word}: {count}次\n")