第八周总结
这一周主要进行了 Python 爬虫的练习。
首先我爬取热词的地址是博客园:https://news.cnblogs.com/n/recommend
然后对爬取到的文本进行分词统计，筛选出出现频率最高的 100 个信息热词。
import jieba import pandas as pd import re from collections import Counter if __name__ == '__main__': filehandle = open("Hotword.txt", "r", encoding='utf-8'); mystr = filehandle.read() seg_list = jieba.cut(mystr) # 默认是精确模式 print(seg_list) # all_words = cut_words.split() # print(all_words) stopwords = {}.fromkeys([line.rstrip() for line in open(r'final.txt',encoding='UTF-8')]) c = Counter() for x in seg_list: if x not in stopwords: if len(x) > 1 and x != '\r\n': c[x] += 1 print('\n词频统计结果:') for (k, v) in c.most_common(100): # 输出词频最高的前两个词 print("%s:%d" % (k, v)) # print(mystr) filehandle.close();
final.txt:


浙公网安备 33010602011771号