大作业
1.选一个自己感兴趣的主题。
2.网络上爬取相关的数据。
3.进行文本分析,生成词云。
4.对文本分析结果解释说明。
5.写一篇完整的博客,附上源代码、数据爬取及分析结果,形成一个可展示的成果。
我选取的主题是一篇关于万圣节的新闻(http://ent.qq.com/a/20171029/027525.htm),然后用 Python 语言从网络上爬取相关的数据。
import requests
from bs4 import BeautifulSoup
import jieba
import matplotlib.pyplot as plt


def getHTMLText(url):
    """Download ``url`` and return the decoded response body.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text, or an empty string when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=25)
        r.raise_for_status()
    except requests.RequestException as exc:
        # Report the failure instead of hiding it; callers still get "".
        print("Request for {} failed: {}".format(url, exc))
        return ""
    # requests falls back to ISO-8859-1 for text responses whose headers do
    # not declare a charset, which garbles Chinese pages. In that case use
    # the encoding detected from the body instead.
    if r.encoding is None or r.encoding.lower() == "iso-8859-1":
        r.encoding = r.apparent_encoding
    return r.text


def _first_text(nodes):
    """Return the text of the first element of ``nodes``, or "" if empty."""
    return nodes[0].get_text() if nodes else ""


def getContent(url):
    """Scrape one news article, print it and save it to ``can.txt``.

    The title, publication time, body paragraphs and editor line are
    located with CSS selectors, echoed to stdout, written to ``can.txt``
    (UTF-8) and finally printed once more as a dictionary.

    Args:
        url: Address of the article page.

    Returns:
        None. Nothing is written when the page could not be downloaded.
    """
    html = getHTMLText(url)
    if not html:
        print("No HTML retrieved from {}; nothing to save.".format(url))
        return

    soup = BeautifulSoup(html, "html.parser")

    # A selector that matches nothing yields "" instead of an IndexError.
    title = _first_text(soup.select("div.hd > h1"))
    publish_time = _first_text(soup.select("div.a_Info > span.a_time"))
    author = _first_text(
        soup.select("div.qq_articleFt > div.qq_toolWrap > div.qq_editor")
    )
    # len() of a Tag counts its children, so this skips empty <p> elements.
    paragraphs = [
        para.get_text()
        for para in soup.select("div.Cnt-Main-Article-QQ > p.text")
        if len(para) > 0
    ]

    print(title)
    print(publish_time)
    print(author)
    for text in paragraphs:
        print(text)
        print()

    # Written as UTF-8 explicitly because the analysis script reads
    # can.txt with encoding='utf-8'.
    with open("can.txt", "w", encoding="utf-8") as fo:
        fo.write(title + "\n")
        fo.write(publish_time + "\n")
        for text in paragraphs:
            fo.write(text + "\n\n")
        fo.write(author + "\n")

    article = {
        'Title': title,
        'Time': publish_time,
        'Paragraph': paragraphs,
        'Author': author,
    }
    print(article)


def main():
    """Scrape the Halloween news article used for this assignment."""
    url = "http://ent.qq.com/a/20171029/027525.htm"
    getContent(url)


if __name__ == "__main__":
    main()
进行文本分析,生成词云。
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud
import re
import sys
import jieba
import matplotlib.pyplot as plt

# Word-frequency analysis of the scraped article: segment can.txt with
# jieba, print the 20 most frequent words and draw a word cloud.

# Maps every code point above the Basic Multilingual Plane to U+FFFD.
# It is not referenced anywhere else in this script.
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

with open('can.txt', "r", encoding='utf-8') as f:
    txt = f.read()

# Words to drop from the statistics (currently none).
ex = set()

words = jieba.lcut(txt)
counts = {}
for word in words:
    word = word.strip()
    # Skip single characters and whitespace-only tokens; they are mostly
    # punctuation and particles that carry no topic information.
    if len(word) <= 1:
        continue
    counts[word] = counts.get(word, 0) + 1
for word in ex:
    # pop() tolerates excluded words that never occurred in the text.
    counts.pop(word, None)

items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
# Slicing copes with texts that contain fewer than 20 distinct words.
for word, count in items[:20]:
    print("{:<10}{:>5}".format(word, count))

if counts:
    # Build the cloud from the full frequency table. The previous code
    # passed only the last word left over from the loop above.
    # NOTE(review): WordCloud's default font may lack CJK glyphs; if the
    # words render as empty boxes, pass font_path=<a Chinese font file>.
    mywc = WordCloud().generate_from_frequencies(counts)
    plt.imshow(mywc)
    plt.axis("off")
    plt.show()
else:
    print("No words to display; can.txt contains no multi-character words.")
对文本分析结果解释说明。
从词云中可以看出,万圣节的新闻主要是关于游戏活动。