Study notes: NetEase crawler --- domestic news (first learning record)
Fetching the news list:
NetEase domestic news loads its list data through JS (JSONP) requests, and each request carries a different URL parameter, e.g. cm_guonei_02:
# url = 'http://temp.163.com/special/00804KVA/cm_guonei.js?callback=data_callback'
# url = 'http://temp.163.com/special/00804KVA/cm_guonei_02.js?callback=data_callback'
This post only scrapes the news list returned by the first request!!! (mainly because I have not yet figured out how to merge the data returned by all of the requests; one possible approach is sketched below)
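A minimal sketch of one way to merge several pages, assuming the list pages simply follow the cm_guonei, cm_guonei_02, cm_guonei_03, ... naming pattern seen in the two URLs above (the page count and the getAllNewsLinks helper are my own additions, not part of the original script):

import requests
import json

def getAllNewsLinks(pages=3):
    # hypothetical helper: fetch several JSONP list pages and merge the article URLs
    base = 'http://temp.163.com/special/00804KVA/cm_guonei{}.js?callback=data_callback'
    links = []
    for i in range(1, pages + 1):
        # the first page has no numeric suffix, later pages use _02, _03, ...
        suffix = '' if i == 1 else '_{:02d}'.format(i)
        res = requests.get(base.format(suffix))
        # strip the data_callback( ... ) wrapper, then parse the JSON array
        body = res.text.lstrip('data_callback(').rstrip(')')
        for item in json.loads(body):
            links.append(item['docurl'])
    return links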
# Fetch the news list
import requests
import json

def getNewsLink(url):
    newslinks = []
    res = requests.get(url)
    # strip the data_callback( ... ) wrapper from the returned string
    newslist = res.text.lstrip('data_callback(').rstrip(')')
    # parse the JSON string into a Python list
    jd = json.loads(newslist)
    # collect every article URL
    for link in jd:
        newslinks.append(link['docurl'])
    return newslinks
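For a quick sanity check, the function can be called directly with the first-page URL from the comments above (the printed slice is just for illustration):

url = 'http://temp.163.com/special/00804KVA/cm_guonei.js?' \
      'callback=data_callback'
links = getNewsLink(url)
print(len(links), links[:3])  # how many article URLs were found, plus a small sample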
# Fetch the article content
import requests
from bs4 import BeautifulSoup
from datetime import datetime


def getDetail(url):
    detail = {}
    res = requests.get(url)
    # print(res.text)
    soup = BeautifulSoup(res.text, 'html.parser')
    # title
    detail['title'] = soup.select('.post_content_main h1')[0].text
    # publish time and source
    sources = soup.select('.post_time_source')[0].text.strip().split('来源')
    time = sources[0].strip()
    detail['dt'] = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
    detail['source'] = sources[1].lstrip(': ')
    # editor (lstrip removes the leading '责任编辑:' characters)
    detail['editor'] = soup.select('.ep-editor')[0].text.lstrip('责任编辑:')
    # article body
    art = []
    for p in soup.select('.post_text p'):
        art.append(p.text.strip())
    detail['article'] = ''.join(art)
    return detail

# url = 'http://news.163.com/18/0514/16/DHPFO0IL0001875N.html'
# print(getDetail(url))
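Not every article page uses this layout (photo galleries and video pages tend to differ), so a small defensive wrapper can be handy. getDetailSafe below is my own addition; it simply skips pages whose selectors or date format do not match:

def getDetailSafe(url):
    # skip pages that do not match the selectors / date format used in getDetail
    try:
        return getDetail(url)
    except (IndexError, ValueError):
        return None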
# Aggregate the data and write it to Excel
from TEST.news_163 import newsLinkList_163
from TEST.news_163 import article_163
import pandas

def getNews(url):
    newstotal = []
    links = newsLinkList_163.getNewsLink(url)
    for link in links:
        newstotal.append(article_163.getDetail(link))
    return newstotal

url = 'http://temp.163.com/special/00804KVA/cm_guonei.js?' \
      'callback=data_callback'
# print(len(getNews(url)))

df = pandas.DataFrame(getNews(url))
df.to_excel('163news.xlsx')
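An optional tweak, assuming the dict keys produced by getDetail above: fixing the column order and dropping the row index makes the spreadsheet a little easier to read (writing .xlsx also requires openpyxl or xlsxwriter to be installed).

df = pandas.DataFrame(getNews(url), columns=['dt', 'title', 'source', 'editor', 'article'])
df.to_excel('163news.xlsx', index=False)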