第八周总结

这一周主要进行了 Python 爬虫的练习。

首先，我爬取热词的来源是博客园：https://news.cnblogs.com/n/recommend

然后在里面筛选出100个出现频率最高的信息热词。

import jieba
import pandas as pd
import re
from collections import Counter
 
if __name__ == '__main__':
    # Count word frequencies in Hotword.txt and print the 100 most common
    # words, ignoring any word listed (one per line) in final.txt.

    # Read the whole corpus; `with` closes the file even if reading fails.
    with open("Hotword.txt", "r", encoding='utf-8') as corpus_file:
        mystr = corpus_file.read()

    seg_list = jieba.cut(mystr)  # default is accurate mode
    # NOTE: this prints the object returned by jieba.cut, not the tokens.
    print(seg_list)

    # Load the stopwords into a set for O(1) membership tests. The original
    # opened this file inline and never closed it.
    with open(r'final.txt', encoding='UTF-8') as stopword_file:
        stopwords = {line.rstrip() for line in stopword_file}

    c = Counter()
    for x in seg_list:
        # Skip stopwords, single-character tokens and bare CRLF line breaks.
        if x not in stopwords and len(x) > 1 and x != '\r\n':
            c[x] += 1

    print('\n词频统计结果:')
    for k, v in c.most_common(100):  # the 100 most frequent words
        print(f"{k}:{v}")
    

final.txt:

posted @ 2022-08-19 23:54  李迎辉  阅读(16)  评论(0)    收藏  举报