Study Notes: NetEase Crawler - Domestic News (First Learning Record)

Fetching the news list:

NetEase's domestic-news page loads its list through JS (JSONP) requests, and each request carries a different URL parameter: the first page fetches cm_guonei.js and the second cm_guonei_02.js, as in the two commented URLs below. The response body is data_callback([...]), a callback wrapping a JSON array whose entries hold each article link in a docurl field.
# url = 'http://temp.163.com/special/00804KVA/cm_guonei.js?' \
# 'callback=data_callback'
# url = 'http://temp.163.com/special/00804KVA/cm_guonei_02.js?' \
# 'callback=data_callback'

This only scrapes the news list returned by the first request!!! (Mainly because I had not yet figured out how to merge the data returned by all the requests; a possible merge is sketched right after the list-fetching code below.)

# Fetch the news list

import requests
import json

def getNewsLink(url):
    newslinks = []
    res = requests.get(url)
    # The response is JSONP; slice out the JSON array inside the
    # "data_callback( ... )" wrapper (str.lstrip/rstrip treat their
    # argument as a character set, not a prefix, so slicing on the
    # parentheses is more reliable)
    text = res.text.strip()
    newslist = text[text.index('(') + 1:text.rindex(')')]
    # Parse the JSON string into a list of dicts
    jd = json.loads(newslist)
    # Walk the entries and collect every article link
    for link in jd:
        newslinks.append(link['docurl'])
    return newslinks
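
The two commented URLs above suggest a naming pattern for the later pages: page 1 is cm_guonei.js and page 2 is cm_guonei_02.js, so pages 3, 4, ... are presumably cm_guonei_03.js and so on (an assumption; only the first two URLs are shown). Under that assumption, a minimal sketch for merging the paginated lists:

# Hypothetical helper built on getNewsLink above; assumes the page
# files follow the cm_guonei.js / cm_guonei_02.js / cm_guonei_03.js
# pattern (only the first two are confirmed)
def getAllNewsLinks(pages):
    base = 'http://temp.163.com/special/00804KVA/cm_guonei{}.js?callback=data_callback'
    alllinks = []
    for i in range(1, pages + 1):
        suffix = '' if i == 1 else '_{:02d}'.format(i)
        alllinks.extend(getNewsLink(base.format(suffix)))
    return alllinks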

# Fetch the article content

import requests
from bs4 import BeautifulSoup
from datetime import datetime


def getDetail(url):
    detail = {}
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    # Title
    detail['title'] = soup.select('.post_content_main h1')[0].text
    # Time and source; the line looks like "<time>  来源: <source>"
    sources = soup.select('.post_time_source')[0].text.strip().split('来源')
    time = sources[0].strip()
    detail['dt'] = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
    detail['source'] = sources[1].lstrip(': ')
    # Editor: drop the "责任编辑:" prefix (replace is used because
    # lstrip treats its argument as a character set, not a prefix)
    detail['editor'] = soup.select('.ep-editor')[0].text.replace('责任编辑:', '').strip()
    # Body text: join the text of every paragraph
    art = []
    for p in soup.select('.post_text p'):
        art.append(p.text.strip())
    detail['article'] = ''.join(art)
    return detail

# url = 'http://news.163.com/18/0514/16/DHPFO0IL0001875N.html'
# print(getDetail(url))
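
One caveat: some linked pages may use a different template (so a selector matches nothing and the [0] index raises IndexError), or a different time format. A small defensive wrapper, my own addition rather than part of the original script:

# Hypothetical wrapper: returns None for pages whose layout does not
# match the selectors used in getDetail
def safeGetDetail(url):
    try:
        return getDetail(url)
    except (IndexError, ValueError):
        # IndexError: a selector matched nothing
        # ValueError: the time string did not match the strptime format
        return None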

 

# Aggregate the data and write it to Excel

from TEST.news_163 import newsLinkList_163
from TEST.news_163 import article_163
import pandas

def getNews(url):
    # Fetch the link list, then scrape the detail of every linked article
    newstotal = []
    links = newsLinkList_163.getNewsLink(url)
    for link in links:
        newstotal.append(article_163.getDetail(link))
    return newstotal

url = 'http://temp.163.com/special/00804KVA/cm_guonei.js?' \
      'callback=data_callback'
# print(len(getNews(url)))

df = pandas.DataFrame(getNews(url))
df.to_excel('163news.xlsx')
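
Since getDetail returns plain dicts, the column order of the DataFrame is not guaranteed; passing columns= pins the spreadsheet layout. Note also that to_excel needs an Excel writer engine such as openpyxl installed. A possible refinement:

# Optional: fix the column order of the output sheet
df = pandas.DataFrame(getNews(url),
                      columns=['title', 'dt', 'source', 'editor', 'article'])
df.to_excel('163news.xlsx')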

 

posted on 2018-05-15 11:43  就那么简单