class ProxyDownloaderMiddleware(object):
    """Scrapy downloader middleware that assigns a pooled proxy to each request.

    Proxies are fetched from an HTTP API and kept in ``IpPool`` as dicts of
    the form ``{'ip_port': <address>, 'useTimes': <int>}``. Each proxy is
    handed out a limited number of times and then retired; ``Ipset`` records
    every address ever fetched so that a retired proxy is not queued again.

    Not all middleware methods need to be defined. If a method is not
    defined, Scrapy acts as if the downloader middleware does not modify
    the passed objects.
    """

    def __init__(self):
        """Create the empty pool and immediately fetch an initial batch of proxies."""
        self.request_proxy_url = ""
        self.IpPool = Queue()  # pool of proxies currently available for use
        self.Ipset = set()  # every proxy address fetched so far (for de-duplication)
        self.request_proxry(number=5)

    def request_proxry(self, number=5):
        """Fetch up to ``number`` new proxies from the API and add them to the pool.

        Does nothing when the pool already holds more than 8 entries.
        Addresses already present in ``Ipset`` are skipped.

        :param number: the number of proxies to request from the API
        :return: None
        """
        # The pool is considered full enough; avoid spending API quota.
        if self.IpPool.qsize() > 8:
            return
        # NOTE(review): the order id is embedded in the URL; consider moving
        # it to configuration so the credential is not kept in source control.
        url = 'https://dps.kdlapi.com/api/getdps/?orderid=987658645908252&num=%d&pt=1&dedup=1&format=json&sep=1' % number
        # A timeout keeps a stalled API call from hanging the crawler forever.
        r = requests.get(url, timeout=10)
        dc = r.json()
        for item in dc['data']['proxy_list']:
            if item in self.Ipset:
                continue
            # The key must match the one read back in get_proxy_ip().
            self.IpPool.put({'ip_port': item, 'useTimes': 0})
            self.Ipset.add(item)
            print(item, '+++++++++++++++++')

    def get_proxy_ip(self):
        """Take the next proxy from the pool and return it as a proxy URL.

        The proxy's use counter is incremented. A proxy used more than 10
        times is not put back into the pool (this call is its last use) and
        a small refill is requested instead; otherwise it is re-queued at
        the end of the pool.

        :return: the proxy address prefixed with ``"https://"``
        """
        # Try to refill before waiting on an empty pool.
        # NOTE: if the refill yields nothing new, get() below still blocks
        # until some other caller puts a proxy into the pool.
        if self.IpPool.empty():
            self.request_proxry(number=5)
        item = self.IpPool.get()
        item["useTimes"] += 1
        if item["useTimes"] > 10:
            self.request_proxry(number=2)
        else:
            self.IpPool.put(item)
        return "https://" + item["ip_port"]

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the ``spider_opened`` signal.

        This method is used by Scrapy to create the middleware instance.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Attach a proxy from the pool to ``request.meta["proxy"]``.

        Called for each request that goes through the downloader middleware.
        Must either:
        - return None: continue processing this request
        - or return a Response object
        - or return a Request object
        - or raise IgnoreRequest: process_exception() methods of
          installed downloader middleware will be called

        :return: None, so the request continues through the chain
        """
        request.meta["proxy"] = self.get_proxy_ip()
        return None

    def process_response(self, request, response, spider):
        """Return the response unchanged.

        Called with the response returned from the downloader.
        Must either:
        - return a Response object
        - return a Request object
        - or raise IgnoreRequest
        """
        return response

    def process_exception(self, request, exception, spider):
        """Take no action, so the exception continues down the chain.

        Called when a download handler or a process_request()
        (from other downloader middleware) raises an exception.
        Must either:
        - return None: continue processing this exception
        - return a Response object: stops process_exception() chain
        - return a Request object: stops process_exception() chain
        """
        pass

    def spider_opened(self, spider):
        """Log that the spider has been opened."""
        spider.logger.info('Spider opened: %s' % spider.name)