scrapy爬取验证码登录网页

Scrapy 验证码登录程序,登录页面如下:

https://accounts.douban.com/login

 

# -*- coding: utf-8 -*-
import urllib

import scrapy

try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve  # Python 2

 5 class MydoubanSpider(scrapy.Spider):
 6     name = "mydouban_"
 7 
 8     def __init__(self, ):
 9         super(MydoubanSpider, self).__init__()
10         self.start_urls = ['https://accounts.douban.com/login']
11         self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"}
12 
13 
14     def parse(self, response):
15         return [scrapy.Request("https://accounts.douban.com/login",callback=self.Login,meta={"cookiejar":1})]
16 
17     def Login(self,response):
18         captcha = response.xpath("//img[@id='captcha_image']/@src").extract()
19         if len(captcha) > 0:
20             #人工输入验证码  下载验证码的图片
21             urllib.urlretrieve(captcha[0],filename="./captcha.jpg")
22             captcha_value=raw_input('查看captcha.png,有验证码请输入:')
23 
24             data={
25             "form_email": "user",
26             "form_password": "psaaword",
27             "captcha-solution": captcha_value,
28             #"redir": "https://www.douban.com/people/151968962/",      #设置需要转向的网址
29             }
30 
31             return [ scrapy.FormRequest.from_response(response,headers=self.headers, meta={"cookiejar":response.meta["cookiejar"]},
32               # headers=self.header,
33               formdata=data, callback=self.get_content, )]
34             pass
35 
36     def get_content(self,response):
37         print("完成登录.........")
38         test = response.xpath('//*[@id="db-global-nav"]/div/div[1]/ul/li[2]/a/span[1]//text()').extract()
39         print ''.join(test)

 

豆瓣的登录程序

github完整代码链接地址: https://github.com/sea1234/myyangzhengma

 

 

 
posted @ 2017-06-22 13:16  sea101  阅读(2233)  评论(0)  编辑  收藏  举报