import jieba
from collections import Counter
import urllib.request

# Download the Dream of the Red Chamber text (fall back to a local copy if offline).
url = "https://raw.githubusercontent.com/noname2048/hongloumeng/master/hongloumeng.txt"
try:
    # `with` ensures the HTTP response is closed (original leaked it).
    # Catch OSError (URLError is a subclass) rather than a bare `except:`,
    # which would also swallow KeyboardInterrupt/SystemExit and real bugs.
    with urllib.request.urlopen(url) as response:
        content = response.read().decode('utf-8')
except (OSError, UnicodeDecodeError):
    # Network download failed — try a local file in the working directory.
    try:
        with open('hongloumeng.txt', 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print("请确保有hongloumeng.txt文件在相同目录下,或检查网络连接")
        # raise SystemExit instead of exit(): exit() is injected by the
        # `site` module and is not guaranteed to exist in every environment.
        raise SystemExit(1)

# Tokenize the full text with jieba.
words = jieba.lcut(content)

# Keep only multi-character, non-whitespace tokens; this drops most
# punctuation and single characters (adjust the length threshold as needed).
filtered_words = [w for w in words if len(w) > 1 and not w.isspace()]

# Tally word frequencies and pull out the 20 most common entries.
word_counts = Counter(filtered_words)
top_20_words = word_counts.most_common(20)

# Report the ranking to stdout.
print("《红楼梦》中出现频率最高的20个词:")
for rank, (word, count) in enumerate(top_20_words, start=1):
    print(f"{rank}. {word}: {count}次")

# Optional: persist the same ranking to a text file.
with open('hongloumeng_word_freq.txt', 'w', encoding='utf-8') as f:
    out_lines = ["《红楼梦》中出现频率最高的20个词:\n"]
    out_lines.extend(
        f"{rank}. {word}: {count}次\n"
        for rank, (word, count) in enumerate(top_20_words, 1)
    )
    f.writelines(out_lines)

 

# posted on 2025-06-23 11:32  1235yyq  阅读(11)  评论(0)    收藏  举报