Web Crawler Final Project

1. Pick a topic that interests you.

2. Write a crawler in Python to scrape data on that topic from the web.

3. Run a text analysis on the scraped data and generate a word cloud.

4. Explain and interpret the results of the text analysis.

5. Write a complete blog post describing the implementation, the problems encountered and how they were solved, the data analysis approach, and the conclusions.

6. Finally, submit all of the scraped data along with the crawler and data analysis source code.

 

# -*- coding: UTF-8 -*-

import requests
import json
import re
from bs4 import BeautifulSoup
import jieba
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator


# Get an article's comment count from Sina's comment API
def getCommentsCounts(newsurl):
    # Extract the doc ID from a URL like .../doc-ifzvpatq7964658.shtml
    bianhao = re.search('doc-i(.+).shtml', newsurl)
    newsid = bianhao.group(1)
    comment = requests.get(commentURL.format(newsid))
    jd = json.loads(comment.text)
    counts = jd['result']['count']['total']
    return counts
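
For example, with the sample article URL defined further down, the regex pulls out the doc ID that the comment API expects:

m = re.search('doc-i(.+).shtml',
              'http://finance.sina.com.cn/chanjing/gsnews/2018-04-29/doc-ifzvpatq7964658.shtml')
print(m.group(1))  # fzvpatq7964658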


def getNewsDetail(newsurl):
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # Title
    result['title'] = soup.select('.main-title')[0].text
    # Source
    result['newssources'] = soup.select('.source')[0].text
    # Date
    result['timesource'] = soup.select('.date')[0].text
    # Editor (drop the "责任编辑:" prefix)
    result['editor'] = soup.select('.show_author')[0].text.strip().lstrip('责任编辑:')
    # Comment count (use this article's own URL, not the global sample url)
    result['comments'] = getCommentsCounts(newsurl)
    # Body text
    result['contents'] = soup.select('.article')[0].text.strip()
    return str(result['contents'])
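
As a quick check, the parser can be run against the single sample article defined below (assuming the page is still live and commentURL has already been defined):

print(getNewsDetail(url)[:100])  # first 100 characters of the article body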


# Save the scraped text to a txt file
def writeNewsContent(content):
    with open('news.txt', 'a', encoding='utf-8') as f:
        f.write(content)


# Parse one page of the news list API and collect every article's body text
def parseListLinks(url):
    newsdetails = []
    res = requests.get(url)
    # The API returns JSONP: strip the "newsloadercallback(...);" wrapper before parsing
    jss = res.text.lstrip('  newsloadercallback(').rstrip(');')
    jd = json.loads(jss)
    for news in jd['result']['data']:
        allURL = news['url']
        newsdetails.append(getNewsDetail(allURL).split())
    writeNewsContent(str(newsdetails))
    return newsdetails
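
Note that lstrip/rstrip strip sets of characters rather than literal prefixes, which happens to work for this response but is fragile. A sketch of the same unwrapping done with a regex instead:

def strip_jsonp(text):
    # Pull the JSON payload out of "newsloadercallback({...});"
    m = re.search(r'newsloadercallback\((.*)\)\s*;?\s*$', text, re.S)
    return json.loads(m.group(1))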


# Comment-count API; the article's doc ID is filled into {}
# (adjacent string literals avoid embedding the line-continuation indent in the URL)
commentURL = ('http://comment5.news.sina.com.cn/page/info?version=1'
              '&format=json&channel=gn&newsid=comos-{}&group=undefined&'
              'compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3')
# Sample article used for testing getNewsDetail / getCommentsCounts
url = 'http://finance.sina.com.cn/chanjing/gsnews/2018-04-29/doc-ifzvpatq7964658.shtml'
# Paginated news list API (JSONP; the page number is filled into {})
listURL = ('http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw'
           '&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1'
           '&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1524705663198')
news_total = []
# Crawl page 1 of the list API (widen the range to fetch more pages)
for i in range(1, 2):
    newssurl = listURL.format(i)
    newsary = parseListLinks(newssurl)
    news_total.extend(newsary)
print(len(news_total))


# Read back the scraped text (news.txt is what writeNewsContent produced)
with open('news.txt', 'r', encoding='utf-8') as f:
    news = f.read()

# Punctuation to blank out before segmentation
sep = ''',。‘’“”:;()!?、《》[] '''
# Words to exclude from the counts (an example stopword set; tune it for your corpus)
exclude = {'的', '了', '是', '在', '和', '也'}



# Blank out punctuation, then segment with jieba
for c in sep:
    news = news.replace(c, ' ')
wordList = list(jieba.cut(news))

# Count token frequencies directly; str.count() would also match substrings
# inside longer words and inflate the numbers
wordDict = {}
for w in wordList:
    w = w.strip()
    if w and w not in exclude:
        wordDict[w] = wordDict.get(w, 0) + 1

dictList = list(wordDict.items())
dictList.sort(key=lambda x: x[1], reverse=True)
cy = {}
# Save the top words to a separate file (not news.txt, which holds the corpus)
# and build the frequency dict that the word cloud will use
with open('wordcount.txt', 'a', encoding='utf-8') as f:
    for word, count in dictList[:1000]:
        print((word, count))
        f.write(word + ':' + str(count) + '\n')
        cy[word] = count
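
The counting and sorting above can also be written in a few lines with collections.Counter; a sketch of the same logic:

from collections import Counter

freq = Counter(w for w in jieba.cut(news) if w.strip() and w not in exclude)
print(freq.most_common(10))  # the ten most frequent words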

# A font with Chinese glyphs is required, otherwise the cloud renders as boxes
font = r'C:\Windows\Fonts\wb.ttf'
# Mask image: words are drawn inside its non-white region
image = Image.open('./wordcloud.jpg')
graph = np.array(image)
wc = WordCloud(font_path=font, background_color='white', max_words=50, mask=graph)
wc.generate_from_frequencies(cy)
# Recolor the words to match the mask image's colors
image_color = ImageColorGenerator(graph)
plt.imshow(wc.recolor(color_func=image_color))
plt.axis("off")
plt.show()
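
To keep a copy of the image on disk as well, WordCloud can save it directly (the filename here is arbitrary):

wc.to_file('wordcloud_result.png')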

The generated word cloud:
