一个完整的大作业

1.选一个自己感兴趣的主题。

2.网络上爬取相关的数据。

import requests
from bs4 import BeautifulSoup
import jieba

def getHTMLText(url):
    """Fetch the page at *url* and return its decoded text, or "" on failure.

    Bug fix: the original set ``r.encoding = 'bg2312'`` — a typo for
    'gb2312'.  An unknown codec name makes ``r.text`` raise LookupError,
    which the bare ``except`` silently converted into an empty string, so
    the crawler never actually worked.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # The QQ news pages of that era were GB2312-encoded; utf-8 produced
        # mojibake, per the original author's note.
        r.encoding = 'gb2312'
        return r.text
    except requests.RequestException:
        # Network / HTTP errors: degrade to an empty document.
        return ""
def getContent(url):
    """Scrape a QQ news article page: print and return its parts.

    Returns a dict with keys 'Title', 'Time', 'Paragraph' (list of <p>
    tags) and 'Author'.  Bug fix: the original assembled the dict but
    never returned it, so callers could not use the scraped data.
    Raises IndexError if the page layout does not match the selectors
    (same as the original).
    """
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    title = soup.select("div.hd > h1")                        # headline
    print(title[0].get_text())
    # renamed from `time` to avoid shadowing the stdlib module name
    pub_time = soup.select("div.a_Info > span.a_time")        # publish time
    print(pub_time[0].string)
    author = soup.select("div.qq_articleFt > div.qq_toolWrap > div.qq_editor")  # editor
    print(author[0].get_text())
    paras = soup.select("div.Cnt-Main-Article-QQ > p.text")   # body paragraphs
    article = {
        'Title': title[0].get_text(),
        'Time': pub_time[0].get_text(),
        'Paragraph': paras,
        'Author': author[0].get_text(),
    }
    print(article)
    return article
def main():
    """Entry point: scrape one hard-coded QQ news article."""
    url = "https://news.qq.com/a/20171029/022800.htm"  # target article
    getContent(url)


# Guard the call so the module can be imported without side effects.
if __name__ == "__main__":
    main()

3.进行文本分析,生成词云。

import jieba

# Generic words too common in this article to be informative in the ranking.
ex = {'军队', '高级干部', '人民军队', '必须', '政治'}

# Use a context manager so the file handle is closed deterministically.
# NOTE(review): no explicit encoding in the original — the platform default
# is used; confirm how news.txt was saved (crawler used gb2312).
with open('news.txt', "r") as f:
    txt = f.read()

counts = {}
for word in jieba.lcut(txt):
    # Single characters are mostly punctuation / function words — skip them.
    if len(word) == 1:
        continue
    counts[word] = counts.get(word, 0) + 1

# pop() with a default: the original del() raised KeyError whenever an
# excluded word did not actually occur in the text.
for word in ex:
    counts.pop(word, None)

# Top-20 words by frequency (gracefully fewer if the text is short —
# the original range(20) raised IndexError on short texts).
items = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
for word, count in items[:20]:
    print("{:<10}{:>5}".format(word, count))

coding:bg2312
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

text =open("news.txt",'r').read()

wordlist = jieba.cut(text,cut_all=True)
wl_split = "/".join(wordlist)

mywc = WordCloud().generate(text)
plt.imshow(mywc)
plt.axis("off")
plt.show()

4.对文本分析结果解释说明。

     通过爬取网站,将网站内容进行词频分析并生成词云。我们可以很好地看出这篇网站文章的重点在于“党的指挥”、“国防”、“建设”、“忠诚”、“创新”、“从严治军”等等,有利于分析重点词汇。

posted @ 2017-10-26 11:50  06胡思琪  阅读(287)  评论(1编辑  收藏  举报