Scrapy: building a custom IP proxy pool
1、Core idea: a downloader middleware
2、Process:
Create a proxy middleware and add the proxy pool's IP list to the settings file.
Register it in the DOWNLOADER_MIDDLEWARES setting; note that the custom proxy middleware must take priority over the built-in HttpProxyMiddleware, i.e. it needs a smaller order number.
3、Create the IP proxy middleware class
import random
from collections import defaultdict

from scrapy.exceptions import NotConfigured
from twisted.internet.error import ConnectError, TimeoutError


class RandomProxyMiddleWare(object):

    def __init__(self, settings):
        # 2. Initialize settings and related state
        self.proxies = settings.getlist('PROXIES')
        self.stats = defaultdict(int)
        self.max_failed = 3

    @classmethod
    def from_crawler(cls, crawler):
        # 1. Create the middleware instance
        # The built-in proxy middleware is enabled by default
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        return cls(crawler.settings)

    def process_request(self, request, spider):
        # 3. Assign a random proxy to every request that does not already have one
        if self.proxies and not request.meta.get('proxy') \
                and request.url not in spider.start_urls:
            # random.choice returns a single proxy string (random.choices would return a list)
            request.meta['proxy'] = random.choice(self.proxies)

    def process_response(self, request, response, spider):
        # 4.0 The request got a response
        cur_proxy = request.meta.get('proxy')
        # Check whether the target site has blocked this proxy IP
        if response.status in (401, 403):
            self.stats[cur_proxy] += 1
            print('%s got wrong code %s times' % (cur_proxy, self.stats[cur_proxy]))
            # Once a proxy has accumulated enough failures...
            if self.stats[cur_proxy] >= self.max_failed:
                print('got wrong http code (%s) when use %s' % (response.status, cur_proxy))
                # ...treat it as banned and remove it from the pool
                self.remove_proxy(cur_proxy)
                del request.meta['proxy']
                # Return the request so it goes back to the scheduler
                return request
        return response

    def process_exception(self, request, exception, spider):
        # 4.1 The request failed at the network level
        cur_proxy = request.meta.get('proxy')
        # If a proxy was used and the error is a connection/timeout error,
        # consider the proxy dead, remove it and send the request back to the scheduler
        if cur_proxy and isinstance(exception, (ConnectError, TimeoutError)):
            print('error (%s) occur when use proxy %s' % (exception, cur_proxy))
            self.remove_proxy(cur_proxy)
            del request.meta['proxy']
            return request

    def remove_proxy(self, proxy):
        if proxy in self.proxies:
            self.proxies.remove(proxy)
            print('remove %s from proxy list' % proxy)
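Once this middleware and the settings from step 4 below are in place, a minimal spider such as the following sketch can confirm that proxies are being rotated. The spider name and the httpbin URLs are only illustrative, not part of the original project; note that process_request above deliberately skips start_urls, so a proxy only appears on follow-up requests.

import scrapy


class ProxyCheckSpider(scrapy.Spider):
    # Hypothetical spider used only to check that the middleware assigns proxies
    name = 'proxy_check'
    start_urls = ['http://httpbin.org/get']

    def parse(self, response):
        # start_urls requests are excluded by the middleware, so issue a
        # follow-up request, which should receive a random proxy
        yield scrapy.Request('http://httpbin.org/ip',
                             callback=self.parse_ip,
                             dont_filter=True)

    def parse_ip(self, response):
        self.logger.info('proxy %s -> %s',
                         response.request.meta.get('proxy'),
                         response.text)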
4、Configure settings
DOWNLOAD_TIMEOUT = 10

# The proxy pool's IP list
PROXIES = [
    'http://192.169.1.1:8000',
]

DOWNLOADER_MIDDLEWARES = {
    # Custom proxy middleware; it must take priority over the built-in
    # HttpProxyMiddleware, i.e. use a smaller order number
    'toscrapy.middlewares.RandomProxyMiddleWare': 749,
}
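A single hard-coded proxy is only a placeholder. One possible variation, sketched below with 'proxies.txt' as a made-up file name, is to load the pool from a text file in settings.py instead of the literal list above:

# settings.py (sketch): read the proxy pool from a text file, one proxy URL per line.
import os

_proxy_file = os.path.join(os.path.dirname(__file__), 'proxies.txt')
if os.path.exists(_proxy_file):
    with open(_proxy_file) as f:
        PROXIES = [line.strip() for line in f if line.strip()]
else:
    PROXIES = []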
Notes:
a、To view the source of the built-in proxy middleware:
scrapy package -> downloadermiddlewares -> httpproxy.py (HttpProxyMiddleware)
b、To view the built-in proxy middleware's priority:
scrapy package -> settings -> default_settings.py
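The relevant entry looks roughly like the excerpt below (abridged; exact numbers can differ between Scrapy versions), which is why the custom middleware is registered at 749, just ahead of the built-in 750:

# Abridged excerpt from scrapy/settings/default_settings.py (values may vary by version)
DOWNLOADER_MIDDLEWARES_BASE = {
    # ...
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
    # ...
}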
Note:
Set DOWNLOAD_TIMEOUT in settings.
To see its default value: scrapy package -> settings -> default_settings.py -> DOWNLOAD_TIMEOUT
DOWNLOAD_TIMEOUT = 10
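For reference, the built-in default is far larger (180 seconds in recent Scrapy versions), so lowering it to 10 lets dead proxies fail fast and reach process_exception quickly:

# Abridged excerpt from scrapy/settings/default_settings.py (value may vary by version)
DOWNLOAD_TIMEOUT = 180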
