爬虫大作业
1.选一个自己感兴趣的主题。
2.用python 编写爬虫程序,从网络上爬取相关主题的数据。
3.对爬取的数据进行文本分析,生成词云。
4.对文本分析结果进行解释说明。
5.写一篇完整的博客,描述上述实现过程、遇到的问题及解决办法、数据分析思想及结论。
6.最后提交爬取的全部数据、爬虫及数据分析源代码。
import re
from collections import Counter
from datetime import datetime

import requests
import jieba
from bs4 import BeautifulSoup


def getNewsDetail(newsUrl):
    """Fetch one NetEase news article, print its publish time, source,
    top-5 keywords and body text, and append the body to a local log file.

    newsUrl: absolute URL of the article detail page.
    """
    resd = requests.get(newsUrl)
    resd.encoding = 'gb2312'  # NetEase detail pages are GB-encoded
    soupd = BeautifulSoup(resd.text, 'html.parser')
    content = soupd.select('#endText')[0].text
    info = soupd.select('.post_time_source')[0].text
    # Raw strings so \d / \s reach the regex engine intact.
    date = re.search(r'(\d{4}.\d{2}.\d{2}\s\d{2}.\d{2}.\d{2})', info).group(1)
    dateTime = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
    sources = re.search(r'来源:\s*(.*)', info).group(1)
    topWords = getTopWords(content)
    print('发布时间:{0}\n来源:{1}'.format(dateTime, sources))
    # getTopWords returns (word, count) pairs; take [0] of each pair so the
    # keywords print as plain words, not tuple reprs.
    print('关键词:{}、{}、{}、{}、{}'.format(
        topWords[0][0], topWords[1][0], topWords[2][0],
        topWords[3][0], topWords[4][0]))
    print(content)
    print('---------------------------')
    # 'with' guarantees the file is closed even if a write raises.
    with open("D:\python/test.txt", 'a', encoding='utf8') as fo:
        fo.write('内容:')
        fo.write(content)
        fo.write('\n')
        fo.write('-----------------------------------------------')
        fo.write('\n')


def getTopWords(content):
    """Tokenize *content* with jieba and return (word, count) pairs sorted
    by descending frequency, after stripping punctuation and stop-words."""
    # Replace punctuation with spaces so jieba does not merge across it.
    punctuation = '''一!“”,。?;’"',.、:\n'''
    for ch in punctuation:
        content = content.replace(ch, ' ')
    wordlist = list(jieba.cut(content))
    # Stop-words and layout residue (NBSP, ideographic space, etc.).
    exclude = {'这', '\u3000', '\r', '\xa0', '时候', '对', '上', '与', '等',
               '不', '', '没有', '很多', '的', '大', '出来', '_', '到', ' ',
               '将', '在', '是', '了', '一', '还', '也', '《', '》', '(',
               ')', '和', '我', '我们', '其', '能够', '以', '个', '短',
               '中', '不是'}
    # Counter.most_common() gives the same (word, count) descending-count
    # list as the original manual set/count/sort, in O(n) instead of O(n^2).
    counts = Counter(w for w in wordlist if w not in exclude)
    return counts.most_common()


def getListPage(listUrl):
    """Print title and link of the articles on a NetEase tech list page,
    then crawl the first article's detail page.

    listUrl: URL of the list (index) page.
    """
    res = requests.get(listUrl)
    res.encoding = 'gbk'  # list pages are GBK-encoded
    soup = BeautifulSoup(res.text, 'html.parser')
    for item in soup.select('#news-flow-content')[0].select('li'):
        anchor = item.select('a')[0]
        url = anchor['href']
        title = anchor.text
        print('标题:{0}\n链接:{1}'.format(title, url))
        getNewsDetail(url)
        # NOTE: original deliberately crawls only the first article per page.
        break


if __name__ == '__main__':
    getListPage('http://tech.163.com/internet/')
    for i in range(2, 9):
        getListPage('http://tech.163.com/special/internet_2016_%02d/' % i)