scrapy 自定义ip代理池

1、核心:下载中间件

2、过程:

  创建代理中间件,在配置文件中添加代理池的ip列表

  修改下载中间件的内容,注意:自定义代理中间件优先级要高于默认的 HttpProxy 中间件

3、创建ip代理池中间件类

 

import random
from collections import defaultdict
from scrapy.exceptions import NotConfigured
from twisted.internet.error import ConnectError, TimeoutError


class RandomProxyMiddleWare(object):
    """Downloader middleware that assigns a random proxy from a pool to each
    request and evicts proxies that repeatedly fail or appear banned.

    Reads the ``PROXIES`` list from the Scrapy settings. Must be registered
    in ``DOWNLOADER_MIDDLEWARES`` with a priority lower (i.e. earlier) than
    the built-in HttpProxyMiddleware (750) so the proxy it sets is honoured.
    """

    def __init__(self, settings):
        # 2. Initialise configuration and bookkeeping state.
        self.proxies = settings.getlist('PROXIES')
        # Per-proxy count of ban-like responses (401/403).
        self.stats = defaultdict(int)
        # A proxy is dropped from the pool after this many failures.
        self.max_failed = 3

    @classmethod
    def from_crawler(cls, crawler):
        # 1. Create the middleware instance; proxying is enabled by default.
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        return cls(crawler.settings)

    def process_request(self, request, spider):
        # 3. Attach a random proxy to every request that does not already
        # carry one, except requests for the spider's start URLs.
        if self.proxies and not request.meta.get('proxy') and request.url not in spider.start_urls:
            # BUG FIX: random.choices() returns a *list*; random.choice()
            # returns a single proxy string, which is what meta['proxy']
            # must contain.
            request.meta['proxy'] = random.choice(self.proxies)

    def process_response(self, request, response, spider):
        # 4.0 A response arrived: count ban-like statuses per proxy.
        cur_proxy = request.meta.get('proxy')
        # 401/403 are treated as "this proxy is banned by the target site".
        if cur_proxy and response.status in (401, 403):
            self.stats[cur_proxy] += 1
            print('%s got wrong code %s times' % (cur_proxy, self.stats[cur_proxy]))
        # Once a proxy has accumulated enough failures, consider it banned.
        # (Guard on cur_proxy so proxy-less requests don't pollute stats
        # with a None key via the defaultdict.)
        if cur_proxy and self.stats[cur_proxy] >= self.max_failed:
            print('got wrong http code (%s) when use %s' % (response.status, cur_proxy))
            # Remove the banned proxy from the pool and strip it from the
            # request so a fresh proxy is assigned on retry.
            self.remove_proxy(cur_proxy)
            del request.meta['proxy']
            # Returning the request re-schedules it with the scheduler.
            return request
        return response

    def process_exception(self, request, exception, spider):
        # 4.1 A network-level error occurred while downloading.
        cur_proxy = request.meta.get('proxy')
        # BUG FIX: the original tested isinstance(cur_proxy, ...), i.e. the
        # proxy URL string, so this branch could never fire. The *exception*
        # type is what must be checked.
        if cur_proxy and isinstance(exception, (ConnectError, TimeoutError)):
            print('error (%s) occur when use proxy %s' % (exception, cur_proxy))
            self.remove_proxy(cur_proxy)
            del request.meta['proxy']
            # Re-schedule the request so it is retried with another proxy.
            return request

    def remove_proxy(self, proxy):
        """Remove *proxy* from the pool if it is still present."""
        if proxy in self.proxies:
            self.proxies.remove(proxy)
            print('remove %s from proxy list' % proxy)

 

 

 

4、配置settings

DOWNLOAD_TIMEOUT=10
# ip代理池的ip
PROXIES = [
    'http://192.169.1.1:8000',
]

DOWNLOADER_MIDDLEWARES = {
    # 自定义的代理ip中间件,优先级要高于自带的
    'toscrapy.middlewares.RandomProxyMiddleWare': 749,
}

注意:

a、查看默认的代理的源码

查看scrapy->查看下载中间件->查看http代理文件(httpproxy.py)

b、查看默认代理的优先级

查看scrapy->查看settings文件->查看default_settings.py

 注意

在settings中设置DOWNLOAD_TIMEOUT

查看scrapy->查看settings文件->查看default_settings.py->DOWNLOAD_TIMEOUT

DOWNLOAD_TIMEOUT=10

 

posted @ 2019-11-08 23:56  市丸银  阅读(141)  评论(0)    收藏  举报