一个完整的大作业
1.选一个自己感兴趣的主题。
2.网络上爬取相关的数据。
import requests
from bs4 import BeautifulSoup
import jieba  # kept from the original; unused in this script


def getHTMLText(url):
    """Fetch the page at *url* and return its decoded text, or "" on any failure."""
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # The intended Chinese codec is 'gb2312'; the original 'bg2312' is a
        # typo and not a valid codec name, so accessing r.text raised
        # LookupError inside the bare except and the function always
        # returned "".
        r.encoding = 'gb2312'
        return r.text
    except (requests.RequestException, LookupError):
        # Best-effort fetch: callers treat "" as "page unavailable".
        return ""


def getContent(url):
    """Scrape title, time, author and body paragraphs from a qq.com news page and print them."""
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    title = soup.select("div.hd > h1")  # headline
    print(title[0].get_text())
    time = soup.select("div.a_Info > span.a_time")  # publication time
    print(time[0].string)
    author = soup.select("div.qq_articleFt > div.qq_toolWrap > div.qq_editor")  # editor/author
    print(author[0].get_text())
    paras = soup.select("div.Cnt-Main-Article-QQ > p.text")  # body paragraphs
    article = {
        'Title': title[0].get_text(),
        'Time': time[0].get_text(),
        'Paragraph': paras,
        'Author': author[0].get_text(),
    }
    print(article)


def main():
    url = "https://news.qq.com/a/20171029/022800.htm"  # page to scrape
    getContent(url)


# Guard the entry point so the module can be imported without side effects.
if __name__ == "__main__":
    main()
3.进行文本分析,生成词云。
from collections import Counter

import jieba

# Generic words to exclude from the ranking (not informative for this article).
ex = {'军队', '高级干部', '人民军队', '必须', '政治'}

# Read the article text with an explicit encoding and close the file promptly.
# NOTE(review): assumes news.txt is UTF-8 — confirm how it was saved.
with open('news.txt', "r", encoding='utf-8') as f:
    txt = f.read()

# Count tokens longer than one character; single characters in Chinese are
# mostly particles and punctuation.  (The original also built an unused
# list `ls` of every token — dropped.)
counts = Counter(word for word in jieba.lcut(txt) if len(word) > 1)

# Drop the excluded words.  The original `del counts[word]` raised KeyError
# whenever an excluded word did not occur in the text.
for word in ex:
    counts.pop(word, None)

# Print the top 20 words (or fewer — the original `range(20)` raised
# IndexError on short texts).
for word, count in counts.most_common(20):
    print("{:<10}{:>5}".format(word, count))
# -*- coding: utf-8 -*-
# Generate a word cloud from the scraped article text.
# (The original header `coding:bg2312` was both garbled and an invalid
# codec name.)
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# NOTE(review): assumes news.txt is UTF-8 — confirm how it was saved.
with open("news.txt", 'r', encoding='utf-8') as f:
    text = f.read()

# WordCloud splits input on whitespace, so Chinese text must be segmented
# first.  The original computed a jieba segmentation (`wl_split`) but then
# generated the cloud from the raw, unsegmented text, leaving the
# segmentation as dead code and producing a cloud of whole sentences.
segmented = " ".join(jieba.cut(text))

# NOTE(review): rendering Chinese glyphs usually requires passing
# font_path= pointing at a CJK font — confirm on the target machine.
mywc = WordCloud().generate(segmented)

plt.imshow(mywc)
plt.axis("off")
plt.show()
4.对文本分析结果解释说明。
通过爬取网页内容、进行词频分析并生成词云,我们可以很好地看出这篇文章的重点在于“党的指挥”、“国防”、“建设”、“忠诚”、“创新”、“从严治军”等词汇,有利于快速把握文章的重点内容。