Web Crawler Final Project

1. Pick a topic or website that interests you. (No two students may choose the same one.)

2. Write a crawler in Python to scrape data on that topic from the web.

3. Run a text analysis on the scraped data and generate a word cloud.

4. Explain and interpret the results of the text analysis.

5. Write a complete blog post describing the implementation, the problems encountered and their solutions, and the data-analysis approach and conclusions.

6. Finally, submit all of the scraped data together with the crawler and analysis source code.

# -*- coding: UTF-8 -*-
# -*- author: WF -*-
import requests
from bs4 import BeautifulSoup
from util.expretion import filter_tags  # local helper that strips HTML tags (sketched after the script)
import time
baseUrl="http://news.gzhu.edu.cn"
url =baseUrl+ "/guangdayaowen/index.html"
def writeNewDetail(content):
    # append the article text to GZCCnews.txt ('a' = append mode)
    with open('GZCCnews.txt', 'a', encoding='utf-8') as f:
        f.write(content)

# get the list of article links on one index page
def getList(url):
    linkList=[]
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text,'html.parser')
    items = soup.select('.news_list')[0].select("table")[1].select("li")  # the second table holds the news list
    for i in items:
        if len(i.select('a')) > 0:  # skip <li> entries without a link
            link = i.select('a')[0].attrs['href']
            print(link)
            linkList.append(link)
    print("******************* page break ************************")
    return linkList
# get the total number of index pages
def getCount(url):
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    count = soup.select('.epages')[0].select("a")[0].text  # the first link in the .epages pager holds the page count
    print(count)
    return count
# get the details of a single article
def getNewDetail(url):
    detail_res = requests.get(url)
    detail_res.encoding = 'utf-8'
    detail_soup = BeautifulSoup(detail_res.text, 'html.parser')  # parse the article detail page
    title = detail_soup.select(".title_info")[0].select("h1")[0].text  # article title (extracted but not written out)
    text = detail_soup.select("#text")
    comment = filter_tags(str(text))  # strip HTML tags, leaving plain text
    return comment
# fetch the details of every article on one index page
def getPageDetail(linkList):
    for item in linkList:
        comment = getNewDetail(baseUrl + item)
        writeNewDetail(comment)
 
if __name__ == '__main__':
    count = getCount(url)
    for page in range(1, int(count) + 1):
        # page 1 is index.html; page n (n > 1) is index_n.html
        if page == 1:
            linkList = getList(url)
        else:
            linkList = getList(baseUrl + "/guangdayaowen/index" + "_" + str(page) + ".html")
        getPageDetail(linkList)
        print("page " + str(page) + " done")
        time.sleep(1)  # pause between pages to be polite to the server
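
The crawler imports filter_tags from a local util.expretion module that is not included in the post. Below is a minimal sketch of what such a helper might look like, assuming its only job is to strip script/style blocks, tags and entities and collapse whitespace; this is a guess at the missing module, not the author's actual code.

# util/expretion.py -- hypothetical reconstruction of the missing helper
import re

def filter_tags(html):
    text = re.sub(r'(?s)<(script|style).*?</\1>', '', html)  # drop script/style blocks
    text = re.sub(r'<[^>]+>', '', text)                      # drop the remaining tags
    text = re.sub(r'&[#a-zA-Z0-9]+;', ' ', text)             # drop entities such as &nbsp;
    return re.sub(r'\s+', ' ', text).strip()                 # collapse runs of whitespace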
 
# generate the word cloud
# -*- coding: UTF-8 -*-
# -*- author: WF -*-
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator
import codecs
import numpy as np
from PIL import Image
import re
file = codecs.open('GZCCnews.txt', 'r', 'utf-8')
image = np.array(Image.open('D:/pythonWork/a.jpg'))  # mask image that shapes the word cloud
font = r'C:\Windows\Fonts\simkai.ttf'  # a Chinese font (KaiTi) so the characters render
word=file.read()
# strip English letters, digits and punctuation, keeping the Chinese text
resultword = re.sub(r"[A-Za-z0-9\[\`\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\:\;\'\,\[\]\.\<\>\/\?\~\!\@\#\\\&\*\%]", "", word)
wordlist_after_jieba = jieba.cut(resultword, cut_all=True)  # full-mode segmentation
 
wl_space_split = " ".join(wordlist_after_jieba)
print(wl_space_split)
my_wordcloud = WordCloud(font_path=font, mask=image, background_color='black',
                         max_words=100, max_font_size=300,
                         random_state=50).generate(wl_space_split)
# recolor the cloud with the mask image's own colors
image_colors = ImageColorGenerator(image)
my_wordcloud.recolor(color_func=image_colors)
# display the generated word cloud
plt.imshow(my_wordcloud)
plt.axis("off")
plt.show()
# save the image; this line only runs after the plot window is closed, so interrupting the program skips the save
my_wordcloud.to_file('result.jpg')
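
For step 4 of the assignment (explaining the analysis results), it helps to look at the raw word frequencies behind the cloud rather than just the picture. Here is a minimal sketch that reuses the wl_space_split string from above; the single-character filter is an illustrative choice, not part of the original post.

from collections import Counter

words = [w for w in wl_space_split.split() if len(w) > 1]  # drop single characters, which are rarely meaningful words
top20 = Counter(words).most_common(20)  # the 20 most frequent terms in the corpus
for w, freq in top20:
    print(w, freq)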

 
