六云

  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理
import requests, re, jieba
from bs4 import BeautifulSoup
from datetime import datetime

def getNewsDetail(newsUrl, outPath=r"D:\python/test.txt"):
    """Fetch one news article, print its details and append its body to a file.

    The page is downloaded, decoded as gb2312 and parsed. The article body is
    read from the ``#endText`` element and the publish time / source from the
    ``.post_time_source`` element.

    Args:
        newsUrl: URL of the article page.
        outPath: File the article body is appended to (written as UTF-8).
            Defaults to the path this script has always written to.

    Returns:
        None. Everything is reported through ``print`` and the output file.

    Raises:
        IndexError: If the page has no ``#endText`` or ``.post_time_source``
            element.
    """
    resd = requests.get(newsUrl)
    resd.encoding = 'gb2312'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    content = soupd.select('#endText')[0].text
    info = soupd.select('.post_time_source')[0].text

    # The pattern uses the same '-' and ':' separators as the strptime format
    # below, so the two can no longer disagree about what a timestamp is.
    dateMatch = re.search(r'(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})', info)
    if dateMatch:
        dateTime = datetime.strptime(dateMatch.group(1), '%Y-%m-%d %H:%M:%S')
    else:
        dateTime = None  # printed as "None" when the page carries no timestamp

    sourceMatch = re.search(r'来源:\s*(.*)', info)
    sources = sourceMatch.group(1) if sourceMatch else ''

    keyWords = getKeyWords(content)
    print('发布时间:{0}\n来源:{1}'.format(dateTime, sources))
    # Joining a slice tolerates articles that yield fewer than three keywords.
    print('关键词:{}'.format('、'.join(keyWords[:3])))
    print(content)

    # The context manager closes the file even if the write fails.
    with open(outPath, 'a', encoding='utf8') as fo:
        fo.write(content)



def getKeyWords(content, topK=3):
    """Return the most frequent multi-character words of ``content``.

    Only characters in the range U+4E00..U+9FA5 are kept before the text is
    segmented with jieba; single-character tokens are discarded.

    Args:
        content: Article text.
        topK: Maximum number of keywords to return (default 3).

    Returns:
        A list of at most ``topK`` words, most frequent first. Words with
        equal counts keep the order in which they first appear in the text.
        The list is shorter than ``topK`` (possibly empty) when the text does
        not contain enough distinct multi-character words.
    """
    from collections import Counter

    # Strip everything except Chinese characters so punctuation, digits and
    # Latin text never reach the tokenizer.
    content = ''.join(re.findall('[\u4e00-\u9fa5]', content))

    # Count the tokens themselves; counting substrings of the joined text
    # would also count a word wherever it occurs inside a longer word.
    wordCounts = Counter(
        word for word in jieba.lcut(content) if len(word) >= 2
    )
    return [word for word, _ in wordCounts.most_common(topK)]



def getListPage(listUrl, limit=1):
    """Print the title and link of news items on a list page and process them.

    The page is downloaded, decoded as gbk and parsed. Each ``<li>`` inside
    the first ``#news-flow-content`` element is treated as one news item; its
    first link is printed and passed to ``getNewsDetail``.

    Args:
        listUrl: URL of the news list page.
        limit: Maximum number of news items to process on this page. Defaults
            to 1, which matches the earlier behaviour of stopping after the
            first item. Pass ``None`` to process every item.

    Returns:
        None.

    Raises:
        IndexError: If the page has no ``#news-flow-content`` element.
    """
    res = requests.get(listUrl)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'html.parser')

    processed = 0
    for new in soup.select('#news-flow-content')[0].select('li'):
        if limit is not None and processed >= limit:
            break
        anchors = new.select('a')
        # Skip list entries that carry no usable link instead of crashing.
        if not anchors or not anchors[0].get('href'):
            continue
        url = anchors[0]['href']
        title = anchors[0].text
        print('标题:{0}\n链接:{1}'.format(title, url))
        # getNewsDetail reports via print and returns nothing, so its result
        # is not printed here.
        getNewsDetail(url)
        processed += 1





# Crawl the section's landing page first, then the numbered archive pages
# 02 through 06, which share a single URL pattern.
for listUrl in ['http://tech.163.com/it/'] + [
    'http://tech.163.com/special/it_2016_%02d/' % pageNo
    for pageNo in range(2, 7)
]:
    getListPage(listUrl)

 因为一直无法下载 Python 中 wordcloud 的相关文件,于是改用了网上的词云生成器。网址是:https://timdream.org/wordcloud

 

运行效果图:

 

 

  此次爬取的是一篇关于区块链的新闻内容:先使用 Python 爬取新闻页面的内容,然后使用网上的词云生成器生成词云。

posted on 2018-04-30 16:22  流XC  阅读(231)  评论(0)  编辑  收藏  举报