scrapy爬虫系列之六--模拟登录
功能点:如何发送携带cookie访问登录后的页面,如何发送post请求登录
爬取网站:bilibili、github
完整代码:https://files.cnblogs.com/files/bookwed/login.zip
主要代码:
bili.py
# -*- coding: utf-8 -*-
import scrapy
import re


class BiliSpider(scrapy.Spider):
    """Visit a logged-in bilibili page by attaching session cookies directly.

    The cookie string is copied from a browser session; scrapy sends it with
    the first request so the spider lands on the authenticated page.
    """
    name = 'bili'
    allowed_domains = ['bilibili.com']
    # Personal-info page, only reachable when logged in
    start_urls = ['https://account.bilibili.com/home/userInfo']

    def start_requests(self):
        cookies = "_uuid=738F48A9-E13A-9445-3577-3068FADC9F6A05981infoc; buvid3=5DE9F436-F051-44E1-9B97-AB53E60C3ED448999infoc;"
        # Split each "name=value" pair on the FIRST '=' only: cookie values
        # (e.g. base64 tokens) may themselves contain '=' characters.
        # rstrip(";") keeps the trailing semicolon out of the last value.
        cookies = {
            pair.split("=", 1)[0]: pair.split("=", 1)[1]
            for pair in cookies.rstrip(";").split("; ")
        }
        # Passing the raw cookie string via headers={"Cookie": ...} does NOT
        # work with scrapy.Request; cookies must go through the cookies= arg.
        print(cookies)
        yield scrapy.Request(
            self.start_urls[0],
            callback=self.parse,
            cookies=cookies,
        )

    def parse(self, response):
        # Verify login worked: the username only appears on the logged-in page.
        print("*" * 30)
        print(re.findall("bookwed", response.body.decode()))
        print("*" * 30)
github.py
# -*- coding: utf-8 -*-
import scrapy
import re


class GithubSpider(scrapy.Spider):
    """Log in to GitHub by POSTing the login form with scrapy.FormRequest.

    Note: when the form has an explicit action URL you can request that URL
    directly; see github2.py for the FormRequest.from_response shortcut.
    """
    name = 'github'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        # Collect the hidden form fields GitHub's login endpoint requires.
        authenticity_token = response.xpath("//input[@name='authenticity_token']/@value").extract_first()
        commit = response.xpath("//input[@name='commit']/@value").extract_first()
        utf8 = response.xpath("//input[@name='utf8']/@value").extract_first()
        webauthn_support = response.xpath("//input[@name='webauthn-support']/@value").extract_first()
        # BUG FIX: the form field is named 'webauthn-support' (with a hyphen),
        # which cannot be spelled as a dict(...) keyword argument. Use a dict
        # literal so the posted key matches the real field name instead of the
        # silently-wrong 'webauthn_support'.
        post_data = {
            "login": "aa@163.com",
            "password": "aaaaaa",
            "commit": commit,
            "utf8": utf8,
            "authenticity_token": authenticity_token,
            "webauthn-support": webauthn_support,
        }
        yield scrapy.FormRequest(
            "https://github.com/session",  # POST target of the login form
            formdata=post_data,
            callback=self.after_login,
        )
        # Alternative: send a POST without FormRequest by setting
        # scrapy.Request(..., method='POST', body=...) yourself.

    def after_login(self, response):
        # When unsure about a response, dump it to a file and inspect:
        # with open('aa.html', 'w', encoding='utf-8') as f:
        #     f.write(response.body.decode())
        print("*" * 30)
        print(re.findall('wed', response.body.decode()))
        print("*" * 30)
github2.py
# -*- coding: utf-8 -*-
import scrapy
import re


class Github2Spider(scrapy.Spider):
    """Log in via the form's action URL using FormRequest.from_response.

    When the form declares an action, scrapy can locate it in the response
    and fill in all hidden fields automatically — only the username and
    password need to be supplied.
    """
    name = 'github2'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    # If the page contains several forms, select one with the formname /
    # formid / formnumber / formxpath keyword arguments.
    def parse(self, response):
        credentials = {"login": "aa@163.com", "password": "aaaaaa"}
        # Keys correspond to the input names on the page; scrapy merges them
        # into the form it finds in the response.
        login_request = scrapy.FormRequest.from_response(
            response,
            formdata=credentials,
            callback=self.after_login,
        )
        yield login_request

    def after_login(self, response):
        banner = "*" * 30
        print(banner)
        print(re.findall('wed', response.body.decode()))
        print(banner)