scrapy爬虫系列之六--模拟登录
功能点:如何发送携带cookie访问登录后的页面,如何发送post请求登录
爬取网站:bilibili、github
完整代码:https://files.cnblogs.com/files/bookwed/login.zip
主要代码:
bili.py
# -*- coding: utf-8 -*-
import scrapy
import re


class BiliSpider(scrapy.Spider):
    """Visit a logged-in bilibili page by attaching session cookies directly.

    The cookie string is copied from a browser session; scrapy sends it with
    the first request so the spider lands on the authenticated page.
    """
    name = 'bili'
    allowed_domains = ['bilibili.com']
    # Personal-info page, only reachable when logged in
    start_urls = ['https://account.bilibili.com/home/userInfo']

    def start_requests(self):
        cookies = "_uuid=738F48A9-E13A-9445-3577-3068FADC9F6A05981infoc; buvid3=5DE9F436-F051-44E1-9B97-AB53E60C3ED448999infoc;"
        # Split each "name=value" pair on the FIRST '=' only: cookie values
        # (e.g. base64 tokens) may themselves contain '=' characters.
        # rstrip(";") keeps the trailing semicolon out of the last value.
        cookies = {
            pair.split("=", 1)[0]: pair.split("=", 1)[1]
            for pair in cookies.rstrip(";").split("; ")
        }
        # Passing the raw cookie string via headers={"Cookie": ...} does NOT
        # work with scrapy.Request; cookies must go through the cookies= arg.
        print(cookies)
        yield scrapy.Request(
            self.start_urls[0],
            callback=self.parse,
            cookies=cookies,
        )

    def parse(self, response):
        # Verify login worked: the username only appears on the logged-in page.
        print("*" * 30)
        print(re.findall("bookwed", response.body.decode()))
        print("*" * 30)
github.py
# -*- coding: utf-8 -*-
import scrapy
import re


class GithubSpider(scrapy.Spider):
    """Log in to GitHub by POSTing the login form with scrapy.FormRequest.

    Note: when the form has an explicit action URL you can request that URL
    directly; see github2.py for the FormRequest.from_response shortcut.
    """
    name = 'github'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        # Collect the hidden form fields GitHub's login endpoint requires.
        authenticity_token = response.xpath("//input[@name='authenticity_token']/@value").extract_first()
        commit = response.xpath("//input[@name='commit']/@value").extract_first()
        utf8 = response.xpath("//input[@name='utf8']/@value").extract_first()
        webauthn_support = response.xpath("//input[@name='webauthn-support']/@value").extract_first()
        # BUG FIX: the form field is named 'webauthn-support' (with a hyphen),
        # which cannot be spelled as a dict(...) keyword argument. Use a dict
        # literal so the posted key matches the real field name instead of the
        # silently-wrong 'webauthn_support'.
        post_data = {
            "login": "aa@163.com",
            "password": "aaaaaa",
            "commit": commit,
            "utf8": utf8,
            "authenticity_token": authenticity_token,
            "webauthn-support": webauthn_support,
        }
        yield scrapy.FormRequest(
            "https://github.com/session",  # POST target of the login form
            formdata=post_data,
            callback=self.after_login,
        )
        # Alternative: send a POST without FormRequest by setting
        # scrapy.Request(..., method='POST', body=...) yourself.

    def after_login(self, response):
        # When unsure about a response, dump it to a file and inspect:
        # with open('aa.html', 'w', encoding='utf-8') as f:
        #     f.write(response.body.decode())
        print("*" * 30)
        print(re.findall('wed', response.body.decode()))
        print("*" * 30)
github2.py
# -*- coding: utf-8 -*-
import scrapy
import re


class Github2Spider(scrapy.Spider):
    """Log in via the form's action URL using FormRequest.from_response.

    When the form declares an action, scrapy can locate it in the response
    and fill in all hidden fields automatically — only the username and
    password need to be supplied.
    """
    name = 'github2'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    # If the page contains several forms, select one with the formname /
    # formid / formnumber / formxpath keyword arguments.
    def parse(self, response):
        credentials = {"login": "aa@163.com", "password": "aaaaaa"}
        # Keys correspond to the input names on the page; scrapy merges them
        # into the form it finds in the response.
        login_request = scrapy.FormRequest.from_response(
            response,
            formdata=credentials,
            callback=self.after_login,
        )
        yield login_request

    def after_login(self, response):
        banner = "*" * 30
        print(banner)
        print(re.findall('wed', response.body.decode()))
        print(banner)