import urllib.parse

import scrapy
from scrapy.http import Request
from scrapy.http.cookies import CookieJar
from scrapy.selector import Selector


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']
    cookie_dict = {}
    def start_requests(self):
        '''
        Wrap each start URL and its callback into a Request object and
        feed it to the scheduler.
        :return:
        '''
        for url in self.start_urls:
            yield Request(url, dont_filter=True, callback=self.parse1)
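        # Note: dont_filter=True bypasses Scrapy's duplicate-request filter,
        # so the start URL is fetched even if it was already seen.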
    def parse1(self, response):
        # Extract the session cookies from the first response.
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value
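        # _cookies is a private attribute of the wrapped http.cookiejar
        # CookieJar: a nested dict keyed domain -> path -> cookie name ->
        # Cookie object. The triple loop flattens it into {name: value}.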
        post_dict = {
            'phone': '',     # fill in the account phone number
            'password': '',  # fill in the account password
            'oneMonth': 1,
        }
        # Log in to Chouti, sending the extracted cookies along.
        yield Request(
            url='http://dig.chouti.com/login',
            method='POST',
            cookies=self.cookie_dict,
            body=urllib.parse.urlencode(post_dict),
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            callback=self.parse2,
        )
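        # A sketch of the more idiomatic alternative, assuming the same login
        # endpoint: scrapy.FormRequest urlencodes the body and sets the
        # Content-Type header itself (formdata values must be strings):
        #
        #   yield scrapy.FormRequest(
        #       url='http://dig.chouti.com/login',
        #       formdata={'phone': '', 'password': '', 'oneMonth': '1'},
        #       cookies=self.cookie_dict,
        #       callback=self.parse2,
        #   )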
    def parse2(self, response):
        '''
        After logging in, fetch the Chouti news page.
        :param response:
        :return:
        '''
        yield Request(url='http://dig.chouti.com/', cookies=self.cookie_dict, callback=self.parse3)
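        # Note: Scrapy's CookiesMiddleware is enabled by default and tracks
        # Set-Cookie headers on its own (see the 'cookiejar' Request.meta
        # key), so passing cookie_dict by hand is likely redundant here; it
        # is kept to make the session handling explicit.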
    def parse3(self, response):
        # Extract the vote ID of every news item on the page, build the
        # vote URL, and upvote it.
        hxs = Selector(response)
        link_id_list = hxs.xpath('//div[@class="part2"]/@share-linkid').extract()
        for link_id in link_id_list:
            url = 'http://dig.chouti.com/link/vote/?linksId=%s' % (link_id,)
            yield Request(url=url, method='POST', cookies=self.cookie_dict, callback=self.parse4)
        # Collect the pagination links and crawl every news page.
        page_list = hxs.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for url in page_list:
            page_url = 'http://dig.chouti.com%s' % (url,)
            yield Request(page_url, method='GET', callback=self.parse3)
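        # Pages reached through more than one pagination link are dropped by
        # Scrapy's default duplicate-request filter, so the recursive
        # parse3 callback cannot loop forever over the same pages.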
    def parse4(self, response):
        # Print the body returned by the vote endpoint.
        print(response.text)
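

# Usage: place this spider in a Scrapy project and run it with
#   scrapy crawl chouti
# after filling the account credentials into post_dict above.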