爬虫基础7(框架Scrapy中手动携带cookie)

处理cookie

# -*- coding: utf-8 -*-
import scrapy
# CookieJar extracts the cookies that a response sets (via its Set-Cookie headers)
from scrapy.http.cookies import CookieJar
from scrapy.http import Request


class KuaidailiSpider(scrapy.Spider):
    """Log in to dig.chouti.com while carrying the session cookies by hand.

    The spider fetches the home page, copies the cookies set by that response
    into ``cookie_dict``, and then passes those cookies explicitly with the
    login request and with the page request that follows it.
    """

    name = 'chouti'
    # Scrapy's offsite filter expects bare domain names here; a full URL
    # triggers a warning and does not match the host of any request.
    allowed_domains = ['dig.chouti.com']
    start_urls = ['https://dig.chouti.com/']
    # Cookie name -> value. Declared on the class, so it is shared by every
    # instance of this spider.
    cookie_dict = {}

    def parse(self, response):
        """Store the cookies set by the home page, then submit the login form.

        Args:
            response: Response for the URL listed in ``start_urls``.

        Yields:
            The login POST request, whose response goes to ``check_login``.
        """
        cookie_jar = CookieJar()
        # Collect the cookies this response set into the jar.
        cookie_jar.extract_cookies(response, response.request)
        # ``_cookies`` is nested as domain -> path -> name -> cookie object;
        # flatten it into the plain dict that ``Request(cookies=...)`` takes.
        for cookies_by_path in cookie_jar._cookies.values():
            for cookies_by_name in cookies_by_path.values():
                for cookie_name, cookie in cookies_by_name.items():
                    self.cookie_dict[cookie_name] = cookie.value
        yield Request(
            url='https://dig.chouti.com/login',
            method='POST',
            # The body is URL-encoded form data, so declare it as such.
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            body='phone=861234567865432123&password=12345432345&oneMonth=1',
            cookies=self.cookie_dict,
            callback=self.check_login,
        )

    def check_login(self, response):
        """Print the login response and request a listing page.

        Args:
            response: Response to the login POST.

        Yields:
            A request for the picture hot list, sent with the stored cookies.
        """
        print(response.text)
        yield Request(
            url="https://dig.chouti.com/r/pic/hot/1",
            cookies=self.cookie_dict,
            callback=self.index,
        )

    def index(self, response):
        """Print the body of the listing page response."""
        print(response.text)

# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.http.cookies import CookieJar
from scrapy import FormRequest


class ChouTiSpider(scrapy.Spider):
    """Log in to dig.chouti.com, request the vote URL for each item, follow paging.

    Cookies set by the first response are copied into ``cookie_dict`` and
    passed explicitly with the login, listing and vote requests.
    """

    # Name of the spider; used to launch it from the command line.
    name = "chouti"
    # Domains the spider is allowed to request.
    allowed_domains = ["chouti.com"]

    # Cookie name -> value captured from the first response (class-level,
    # shared by every instance).
    cookie_dict = {}
    # MD5 hex digest of a page URL -> that URL, for pages already scheduled.
    # Despite the name this is a dict, kept as-is for compatibility.
    has_request_set = {}

    def start_requests(self):
        """Yield the initial home-page request, handled by ``login``."""
        yield Request(url='http://dig.chouti.com/', callback=self.login)

    def login(self, response):
        """Store the cookies set by the home page, then submit the login form.

        Args:
            response: Response for the home page.

        Yields:
            The login POST request, whose response goes to ``check_login``.
        """
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)
        # ``_cookies`` is nested as domain -> path -> name -> cookie object;
        # flatten it into the plain dict that ``Request(cookies=...)`` takes.
        for cookies_by_path in cookie_jar._cookies.values():
            for cookies_by_name in cookies_by_path.values():
                for cookie_name, cookie in cookies_by_name.items():
                    self.cookie_dict[cookie_name] = cookie.value

        yield Request(
            url='http://dig.chouti.com/login',
            method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            body='phone=8615131255089&password=pppppppp&oneMonth=1',
            cookies=self.cookie_dict,
            callback=self.check_login,
        )

    def check_login(self, response):
        """Re-request the home page with the stored cookies.

        Args:
            response: Response to the login POST (not inspected here).

        Yields:
            A GET request for the home page, handled by ``show``.
        """
        yield Request(
            url='http://dig.chouti.com/',
            method='GET',
            callback=self.show,
            cookies=self.cookie_dict,
            # The home page was already fetched once; bypass the duplicate
            # filter so this second request is actually sent.
            dont_filter=True,
        )

    def show(self, response):
        """Request the vote URL for each listed item and schedule unseen pages.

        Args:
            response: Response for a listing page.

        Yields:
            One POST vote request per item that exposes a ``share-linkid``,
            followed by one GET request per pagination link whose URL is not
            yet recorded in ``has_request_set``.
        """
        import hashlib

        news_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        for news in news_list:
            link_id = news.xpath('*/div[@class="part2"]/@share-linkid').extract_first()
            if link_id is None:
                # Without a link id the vote URL would end in "linksId=None".
                continue
            yield Request(
                url='http://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
                method='POST',
                cookies=self.cookie_dict,
                callback=self.do_favor,
            )

        # Raw string so that ``\d`` reaches the XPath regex unchanged.
        page_list = response.xpath(
            r'//div[@id="dig_lcpage"]//a[re:test(@href, "/all/hot/recent/\d+")]/@href'
        ).extract()
        for page in page_list:
            page_url = 'http://dig.chouti.com%s' % page
            key = hashlib.md5(page_url.encode('utf-8')).hexdigest()
            if key in self.has_request_set:
                continue
            self.has_request_set[key] = page_url
            yield Request(
                url=page_url,
                method='GET',
                callback=self.show,
            )

    def do_favor(self, response):
        """Print the body of the vote response."""
        print(response.text)

示例：自动登录抽屉并点赞

自动携带cookie

在 Request 中传入 meta={"cookiejar":True}，由 Scrapy 的 CookiesMiddleware 自动保存并携带 cookie，无需手动解析；后续请求需继续传入相同的 cookiejar 值。

posted @ 2018-07-04 16:27 争-渡 阅读(635) 评论(0) 收藏 举报

刷新页面返回顶部

争-渡

爬虫基础7(框架Scrapy中手动携带cookie)

处理cookie

公告