爬虫基础7(框架Scrapy中手动携带cookie)
处理cookie
# -*- coding: utf-8 -*-
import scrapy
# CookieJar parses the cookies that a response sets for its request.
from scrapy.http.cookies import CookieJar
from scrapy.http import Request


class KuaidailiSpider(scrapy.Spider):
    """Log in to dig.chouti.com while carrying cookies by hand.

    Flow: fetch the start page, copy the cookies it sets into
    ``cookie_dict``, POST the login form with those cookies, then request
    a page with the same cookies and print the response.
    """

    name = 'chouti'
    # Must be bare domain names: Scrapy ignores URL entries here (with a
    # warning), which breaks off-site filtering.
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    # Cookie name -> cookie value, filled in by parse() and re-sent
    # explicitly with every later request.
    cookie_dict = {}

    def parse(self, response):
        """Collect the cookies set by the start page, then submit the login form."""
        cookie_jar = CookieJar()
        # Pull the cookies out of the response and store them in the jar.
        cookie_jar.extract_cookies(response, response.request)

        # The jar keeps its cookies in a three-level mapping whose innermost
        # level maps cookie name -> cookie object; flatten it to name -> value.
        for paths in cookie_jar._cookies.values():
            for cookies_by_name in paths.values():
                for cookie_name, cookie in cookies_by_name.items():
                    self.cookie_dict[cookie_name] = cookie.value

        yield Request(
            url='https://dig.chouti.com/login',
            method='POST',
            # The body is urlencoded form data, so declare it as such.
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            body='phone=861234567865432123&password=12345432345&oneMonth=1',
            cookies=self.cookie_dict,
            callback=self.check_login,
        )

    def check_login(self, response):
        """Print the login response, then request a page with the saved cookies."""
        print(response.text)
        yield Request(
            url="https://dig.chouti.com/r/pic/hot/1",
            cookies=self.cookie_dict,
            callback=self.index,
        )

    def index(self, response):
        """Print the body of the page fetched after logging in."""
        print(response.text)
# -*- coding: utf-8 -*-
import hashlib

import scrapy
from scrapy.http.request import Request
from scrapy.http.cookies import CookieJar
from scrapy import FormRequest


class ChouTiSpider(scrapy.Spider):
    """Log in to dig.chouti.com, vote on every listed item and follow pagination.

    Cookies are carried by hand: they are copied out of the first response
    into ``cookie_dict`` and passed explicitly to later requests.
    """

    # Spider name; the crawl command starts the spider by this name.
    name = "chouti"
    # Allowed domains.
    allowed_domains = ["chouti.com"]
    # Cookie name -> cookie value, filled in by login().
    cookie_dict = {}
    # MD5 hex digest of a pagination URL -> that URL, for pages already queued.
    has_request_set = {}

    def start_requests(self):
        """Fetch the home page first so that its cookies can be captured."""
        url = 'http://dig.chouti.com/'
        yield Request(url=url, callback=self.login)

    def login(self, response):
        """Collect the cookies set by the home page, then submit the login form."""
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)

        # The jar keeps its cookies in a three-level mapping whose innermost
        # level maps cookie name -> cookie object; flatten it to name -> value.
        for paths in cookie_jar._cookies.values():
            for cookies_by_name in paths.values():
                for cookie_name, cookie in cookies_by_name.items():
                    self.cookie_dict[cookie_name] = cookie.value

        yield Request(
            url='http://dig.chouti.com/login',
            method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            body='phone=8615131255089&password=pppppppp&oneMonth=1',
            cookies=self.cookie_dict,
            callback=self.check_login,
        )

    def check_login(self, response):
        """Re-request the home page with the saved cookies after logging in."""
        yield Request(
            url='http://dig.chouti.com/',
            method='GET',
            callback=self.show,
            cookies=self.cookie_dict,
            # The home page was already fetched once; bypass the duplicate filter.
            dont_filter=True,
        )

    def show(self, response):
        """Vote on every item of a listing page and queue unseen pagination links."""
        # response.xpath() replaces the legacy HtmlXPathSelector(...).select() API.
        news_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        for new in news_list:
            link_id = new.xpath('*/div[@class="part2"]/@share-linkid').extract_first()
            if link_id is None:
                # No share id on this item: skip it rather than vote for "None".
                continue
            yield Request(
                url='http://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
                method='POST',
                cookies=self.cookie_dict,
                callback=self.do_favor,
            )

        # Raw string so that \d reaches the XPath regex without an
        # invalid-escape warning.
        page_list = response.xpath(
            r'//div[@id="dig_lcpage"]//a[re:test(@href, "/all/hot/recent/\d+")]/@href'
        ).extract()
        for page in page_list:
            page_url = 'http://dig.chouti.com%s' % page
            key = hashlib.md5(page_url.encode('utf-8')).hexdigest()
            if key in self.has_request_set:
                continue
            self.has_request_set[key] = page_url
            yield Request(
                url=page_url,
                method='GET',
                callback=self.show,
            )

    def do_favor(self, response):
        """Print the response returned by a vote request."""
        print(response.text)
自动携带cookie
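在 Request 中传入 meta={"cookiejar": True}，由 Scrapy 内置的 CookiesMiddleware 自动保存并携带 cookie；cookiejar 这个 meta 键不会自动传递，后续请求需要继续传入同一个值。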

浙公网安备 33010602011771号