Web Scraping Basics 10 (Downloader Middleware in the Scrapy Framework)

 

Downloader Middleware in the Scrapy Framework

Where the user-configured proxies are read from (source location):

from urllib.request import getproxies

The logic in urllib's request.py that reads the user-configured proxies:

import os

def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    # in order to prefer lowercase variables, process environment in
    # two passes: first matches any, second pass matches lowercase only
    """
    {
        HTTP_PROXY:1.1.1.1,
        HTTPS_PROXY:1.1.1.2,
    }
    """
    proxies={
        "http":1.1.1.1,
        "https":1.1.1.2,
    }
    # os.environ = 一个进程中的环境变量
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY
    # (non-all-lowercase) as it may be set from the web server by a "Proxy:"
    # header from the client
    # If "proxy" is lowercase, it will still be used thanks to the next block
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    for name, value in os.environ.items():
        if name[-6:] == '_proxy':
            name = name.lower()
            if value:
                proxies[name[:-6]] = value
            else:
                proxies.pop(name[:-6], None)
    return proxies
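
A quick check of the behaviour above (a sketch with made-up addresses; run it outside a CGI environment so the REQUEST_METHOD guard does not fire):

import os
from urllib.request import getproxies

# any variable ending in _proxy (upper- or lowercase) is picked up;
# the dict key is the part before '_proxy', lowercased
os.environ['HTTP_PROXY'] = 'http://192.168.12.12:8888'
os.environ['https_proxy'] = 'http://root:woshiniba@192.168.11.11:9999/'

proxies = getproxies()
print(proxies['http'])    # http://192.168.12.12:8888
print(proxies['https'])   # http://root:woshiniba@192.168.11.11:9999/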

httpproxy.py (the main proxy-handling source in Scrapy):

import base64
from six.moves.urllib.request import getproxies, proxy_bypass
from six.moves.urllib.parse import unquote
try:
    from urllib2 import _parse_proxy
except ImportError:
    from urllib.request import _parse_proxy
from six.moves.urllib.parse import urlunparse

from scrapy.utils.httpobj import urlparse_cached
from scrapy.exceptions import NotConfigured
from scrapy.utils.python import to_bytes
# where getproxies is defined (urllib.request in Python 3)
from urllib.request import getproxies

class HttpProxyMiddleware(object):
 
    def __init__(self, auth_encoding='latin-1'):
        self.auth_encoding = auth_encoding
        self.proxies = {}
        """
        默认代理是从os.environ环境变量中获取,只要遵循_proxy结尾
        getproxies = {
            "http":http://root:woshiniba@192.168.11.11:9999/,
            "https":"1.1.1.2:9999",
        }
        #  base64加密
        self.proxies={
            "http":("basic asdasdadkjsada",192.168.11.11:9999)
        }   "https":(None,1.1.1.2:9999)
        """
        for type, url in getproxies().items():
            self.proxies[type] = self._get_proxy(url, type)

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING')
        return cls(auth_encoding)

    def _basic_auth_header(self, username, password):
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)),
            encoding=self.auth_encoding)
        return base64.b64encode(user_pass).strip()

    def _get_proxy(self, url, orig_type):
        # url="1.1.1.1",
        # orig_type="http",
        # http://root:woshiniba@192.168.11.11:9999/= _parse_proxy(url)
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))

        if user:
            creds = self._basic_auth_header(user, password)
        else:
            creds = None

        return creds, proxy_url

    def process_request(self, request, spider):
        # ignore if proxy is already set
        if 'proxy' in request.meta:
            if request.meta['proxy'] is None:
                return
            # extract credentials if present
            creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
            request.meta['proxy'] = proxy_url
            if creds and not request.headers.get('Proxy-Authorization'):
                request.headers['Proxy-Authorization'] = b'Basic ' + creds
            return
        elif not self.proxies:
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme

        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return
        # use the proxy that matches the request's scheme (http or https)
        if scheme in self.proxies:
            self._set_proxy(request, scheme)

    def _set_proxy(self, request, scheme):
        """
        加代理
        """
        creds, proxy = self.proxies[scheme]
        request.meta['proxy'] = proxy
        if creds:
            # if the proxy requires authentication, also send the Proxy-Authorization header
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
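
To make those (creds, proxy_url) tuples concrete, this is roughly what _get_proxy returns for an authenticated and an unauthenticated proxy URL (a sketch using the made-up addresses from the comments, assuming a Scrapy version matching the source above):

from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware

mw = HttpProxyMiddleware()
print(mw._get_proxy('http://root:woshiniba@192.168.11.11:9999/', 'http'))
# (b'cm9vdDp3b3NoaW5pYmE=', 'http://192.168.11.11:9999')
print(mw._get_proxy('https://1.1.1.2:9999', 'https'))
# (None, 'https://1.1.1.2:9999')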

Using Scrapy's built-in proxy support:

  The proxy has to be in place before requests are scheduled, so set it at the top of start_requests.

def start_requests(self):
    # set the proxies first, before any request is yielded
    import os
    os.environ['HTTPS_PROXY'] = "http://root:woshiniba@192.168.11.11:9999/"
    os.environ['HTTP_PROXY'] = "192.168.12.12"
    # starting URLs can also be sent as POST: yield Request(url=url, method='POST')
    for url in self.start_urls:
        yield Request(url=url)

Setting proxies in Scrapy

  Built-in approach

    Set the proxies in os.environ before the spider starts crawling.

  Option 1

import scrapy
from scrapy.http import Request

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    cookie_dict = {}

    def start_requests(self):
        import os
        os.environ['HTTPS_PROXY'] = "http://root:woshiniba@192.168.11.11:9999/"
        os.environ['HTTP_PROXY'] = "19.11.2.32"
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse)

  Option 2: via meta

import scrapy
from scrapy.http import Request

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    cookie_dict = {}

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse,
                          meta={'proxy': 'http://root:woshiniba@192.168.11.11:9999/'})
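
Note how this interacts with the built-in HttpProxyMiddleware shown earlier: when meta['proxy'] carries credentials, process_request strips them out of the URL and moves them into the Proxy-Authorization header. A quick standalone check (a sketch; the spider argument is not used, so None is passed):

from scrapy.http import Request
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware

mw = HttpProxyMiddleware()
req = Request('https://dig.chouti.com/',
              meta={'proxy': 'http://root:woshiniba@192.168.11.11:9999/'})
mw.process_request(req, spider=None)
print(req.meta['proxy'])                   # http://192.168.11.11:9999
print(req.headers['Proxy-Authorization'])  # b'Basic cm9vdDp3b3NoaW5pYmE='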

  Custom approach

a. Define a module (proxy.py)

import base64
import random
from six.moves.urllib.parse import unquote
try:
    from urllib2 import _parse_proxy
except ImportError:
    from urllib.request import _parse_proxy
from six.moves.urllib.parse import urlunparse
from scrapy.utils.python import to_bytes

class MyProxyMiddleware(object):

    def _basic_auth_header(self, username, password):
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)),
            encoding='latin-1')
        return base64.b64encode(user_pass).strip()

    def process_request(self, request, spider):
        PROXIES = [
            "http://root:woshiniba@192.168.11.11:9999/",
            "http://root:woshiniba@192.168.11.12:9999/",
            "http://root:woshiniba@192.168.11.13:9999/",
            "http://root:woshiniba@192.168.11.14:9999/",
            "http://root:woshiniba@192.168.11.15:9999/",
            "http://root:woshiniba@192.168.11.16:9999/",
        ]
        # pick a random proxy for each request
        url = random.choice(PROXIES)

        orig_type = ""
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))

        if user:
            creds = self._basic_auth_header(user, password)
        else:
            creds = None
        # attach the proxy, plus the auth header when credentials are present
        request.meta['proxy'] = proxy_url
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
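
The middleware can be sanity-checked outside of a full crawl (a sketch; it assumes proxy.py lives in a project package named www, as in the settings below, and passes None for the unused spider argument):

from scrapy.http import Request
from www.proxy import MyProxyMiddleware

mw = MyProxyMiddleware()
req = Request('https://dig.chouti.com/')
mw.process_request(req, spider=None)
print(req.meta['proxy'])                   # e.g. http://192.168.11.13:9999
print(req.headers['Proxy-Authorization'])  # b'Basic cm9vdDp3b3NoaW5pYmE='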

b. Register it in settings.py. Because it implements process_request, it is a downloader middleware and must go in DOWNLOADER_MIDDLEWARES (not SPIDER_MIDDLEWARES):

DOWNLOADER_MIDDLEWARES = {
    'www.proxy.MyProxyMiddleware': 543,
}
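
If only the custom logic should manage proxies, the built-in HttpProxyMiddleware can be disabled in the same setting (a sketch; 'www' is the example project package used above):

DOWNLOADER_MIDDLEWARES = {
    'www.proxy.MyProxyMiddleware': 543,
    # turn off Scrapy's built-in proxy middleware so only the custom one runs
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
}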

 
