Crawler Basics 10: Download Middleware in the Scrapy Framework
Where the user-configured proxy lookup lives in the standard library:
from urllib.request import getproxies
The proxy-reading logic in request.py:
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.
    """
    proxies = {}
    # in order to prefer lowercase variables, process environment in
    # two passes: first matches any, second pass matches lowercase only

    # Example: environment variables such as
    #     HTTP_PROXY=1.1.1.1
    #     HTTPS_PROXY=1.1.1.2
    # end up in the result as
    #     proxies = {"http": "1.1.1.1", "https": "1.1.1.2"}

    # os.environ holds the environment variables of the current process
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY
    # (non-all-lowercase) as it may be set from the web server by a "Proxy:"
    # header from the client
    # If "proxy" is lowercase, it will still be used thanks to the next block
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    for name, value in os.environ.items():
        if name[-6:] == '_proxy':
            name = name.lower()
            if value:
                proxies[name[:-6]] = value
            else:
                proxies.pop(name[:-6], None)
    return proxies
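To watch this scan in action, here is a minimal sketch; the proxy addresses are placeholders, and on platforms with system-level proxy settings (e.g. macOS) getproxies() may consult those as well when no environment variables are set:

import os
from urllib.request import getproxies

# Any variable ending in _proxy (case-insensitive) is picked up:
os.environ['HTTP_PROXY'] = 'http://1.1.1.1:8888'
os.environ['HTTPS_PROXY'] = 'http://1.1.1.2:8888'

print(getproxies())
# {'http': 'http://1.1.1.1:8888', 'https': 'http://1.1.1.2:8888'}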
httpproxy.py (the main proxy-handling source in Scrapy):
import base64
from six.moves.urllib.request import getproxies, proxy_bypass
from six.moves.urllib.parse import unquote
try:
    from urllib2 import _parse_proxy
except ImportError:
    from urllib.request import _parse_proxy
from six.moves.urllib.parse import urlunparse

from scrapy.utils.httpobj import urlparse_cached
from scrapy.exceptions import NotConfigured
from scrapy.utils.python import to_bytes

# (getproxies itself lives in urllib.request)


class HttpProxyMiddleware(object):

    def __init__(self, auth_encoding='latin-1'):
        self.auth_encoding = auth_encoding
        self.proxies = {}
        # By default, proxies come from the os.environ environment variables;
        # any variable ending in _proxy is honoured, e.g.:
        #     getproxies() -> {
        #         "http": "http://root:woshiniba@192.168.11.11:9999/",
        #         "https": "1.1.1.2:9999",
        #     }
        # Credentials are base64-encoded, so the mapping built here looks like:
        #     self.proxies = {
        #         "http": (b"cm9vdDp3b3NoaW5pYmE=", "http://192.168.11.11:9999"),
        #         "https": (None, "https://1.1.1.2:9999"),
        #     }
        for type, url in getproxies().items():
            self.proxies[type] = self._get_proxy(url, type)

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING')
        return cls(auth_encoding)

    def _basic_auth_header(self, username, password):
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)),
            encoding=self.auth_encoding)
        return base64.b64encode(user_pass).strip()

    def _get_proxy(self, url, orig_type):
        # e.g. url = "http://root:woshiniba@192.168.11.11:9999/",
        #      orig_type = "http"
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))

        if user:
            creds = self._basic_auth_header(user, password)
        else:
            creds = None

        return creds, proxy_url

    def process_request(self, request, spider):
        # ignore if proxy is already set
        if 'proxy' in request.meta:
            if request.meta['proxy'] is None:
                return
            # extract credentials if present
            creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
            request.meta['proxy'] = proxy_url
            if creds and not request.headers.get('Proxy-Authorization'):
                request.headers['Proxy-Authorization'] = b'Basic ' + creds
            return
        elif not self.proxies:
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme

        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return

        # check whether the incoming request is http or https
        if scheme in self.proxies:
            self._set_proxy(request, scheme)

    def _set_proxy(self, request, scheme):
        """Attach the configured proxy to the request."""
        creds, proxy = self.proxies[scheme]
        request.meta['proxy'] = proxy
        if creds:
            # if authentication is required, send Proxy-Authorization along
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
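As from_crawler shows, the middleware only activates when HTTPPROXY_ENABLED is truthy, and the credential encoding is also configurable. A sketch of the corresponding settings.py entries (values illustrative):

# settings.py
HTTPPROXY_ENABLED = True             # otherwise from_crawler raises NotConfigured
HTTPPROXY_AUTH_ENCODING = 'latin-1'  # encoding for the Basic-auth credentials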
Scrapy's built-in proxy configuration:
The proxy must be in place before the crawl starts, so set it in start_requests:
def start_requests(self):
    # set the proxy before any request is yielded
    import os
    os.environ['HTTPS_PROXY'] = "http://root:woshiniba@192.168.11.11:9999/"
    os.environ['HTTP_PROXY'] = "192.168.12.12"
    # one way to define the start urls (a POST is also possible:
    # yield Request(url=url, method='post'))
    for url in self.start_urls:
        yield Request(url=url)
Setting a proxy in Scrapy
Built-in approach
Set the proxy in os.environ before the spider starts running.
Method 1: os.environ
class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    cookie_dict = {}

    def start_requests(self):
        import os
        os.environ['HTTPS_PROXY'] = "http://root:woshiniba@192.168.11.11:9999/"
        os.environ['HTTP_PROXY'] = '19.11.2.32'  # no trailing comma, or this becomes a tuple
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse)
Method 2: request.meta
class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    cookie_dict = {}

    def start_requests(self):
        for url in self.start_urls:
            yield Request(
                url=url,
                callback=self.parse,
                meta={'proxy': 'http://root:woshiniba@192.168.11.11:9999/'},
            )
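Since the built-in process_request returns immediately when request.meta['proxy'] is None (see the source above), meta can also be used to exempt a single request from any environment-level proxy. A minimal sketch:

# This request goes out directly, even if *_proxy environment variables are set:
yield Request(
    url='https://dig.chouti.com/',
    callback=self.parse,
    meta={'proxy': None},
)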
Custom middleware approach
a. Define a module (proxy.py)
import base64
import random
from six.moves.urllib.parse import unquote
try:
    from urllib2 import _parse_proxy
except ImportError:
    from urllib.request import _parse_proxy
from six.moves.urllib.parse import urlunparse
from scrapy.utils.python import to_bytes


class MyProxyMiddleware(object):

    def _basic_auth_header(self, username, password):
        # build the base64-encoded "user:password" payload for Basic auth
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)),
            encoding='latin-1')
        return base64.b64encode(user_pass).strip()

    def process_request(self, request, spider):
        PROXIES = [
            "http://root:woshiniba@192.168.11.11:9999/",
            "http://root:woshiniba@192.168.11.12:9999/",
            "http://root:woshiniba@192.168.11.13:9999/",
            "http://root:woshiniba@192.168.11.14:9999/",
            "http://root:woshiniba@192.168.11.15:9999/",
            "http://root:woshiniba@192.168.11.16:9999/",
        ]
        # pick a random proxy for every outgoing request
        url = random.choice(PROXIES)
        orig_type = ""
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))

        if user:
            creds = self._basic_auth_header(user, password)
        else:
            creds = None

        request.meta['proxy'] = proxy_url
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
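For reference, this is how _parse_proxy splits one of the entries above into the pieces the middleware recombines into credentials plus a bare proxy URL (Python 3 import path shown):

from urllib.request import _parse_proxy

print(_parse_proxy("http://root:woshiniba@192.168.11.11:9999/"))
# ('http', 'root', 'woshiniba', '192.168.11.11:9999')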
b. Register it in settings.py
# process_request makes this a downloader middleware, so register it in
# DOWNLOADER_MIDDLEWARES (not SPIDER_MIDDLEWARES):
DOWNLOADER_MIDDLEWARES = {
    # 'www.middlewares.WwwDownloaderMiddleware': 543,
    'www.proxy.MyProxyMiddleware': 543,
}
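If the built-in environment-variable handling should not interfere with the custom rotation, the stock middleware can be switched off in the same dict; the import path below is its Scrapy 1.x location:

DOWNLOADER_MIDDLEWARES = {
    'www.proxy.MyProxyMiddleware': 543,
    # disable the built-in proxy middleware so *_proxy environment
    # variables no longer affect requests:
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
}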
