代理池项目
一、代理池概述
免费代理很不稳定,通常只有约10%可用;廉价的付费代理可用率也只有30%-50%;只有专业的代理IP提供商才能提供高可用的代理(例如智连HTTP)。
五、代理池工具模块
1. 日志模块:日志代码可以在不同项目之间复用
import sys
import logging
# Minimum severity the configured logger lets through.
LOG_LEVEL = logging.INFO
# Record layout: timestamp, source file, line number, level name, message.
LOG_FMT = '%(asctime)s %(filename)s [line:%(lineno)d] %(levelname)s: %(message)s'
# Timestamp layout used for %(asctime)s.
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
# Path of the file written by the file handler.
LOG_FILENAME = 'log.log'


class Logger(object):
    """Configure the root logger to write to a log file and to stdout.

    The handlers are attached to the process-wide root logger, so they are
    installed only on the first instantiation; creating further instances
    would otherwise duplicate every log record.
    """

    # Set once the file/console handlers have been attached to the root logger.
    _configured = False

    def __init__(self):
        self._logger = logging.getLogger()
        self.formatter = logging.Formatter(fmt=LOG_FMT, datefmt=LOG_DATEFMT)
        if not Logger._configured:
            self._logger.addHandler(self._get_file_handler(LOG_FILENAME))
            self._logger.addHandler(self._get_console_handler())
            Logger._configured = True
        self._logger.setLevel(LOG_LEVEL)

    def _get_file_handler(self, filename):
        """Return a UTF-8 file handler for *filename* using the shared formatter."""
        filehandler = logging.FileHandler(filename=filename, encoding='utf-8')
        filehandler.setFormatter(self.formatter)
        return filehandler

    def _get_console_handler(self):
        """Return a handler that writes to ``sys.stdout`` using the shared formatter."""
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setFormatter(self.formatter)
        return console_handler

    @property
    def logger(self):
        """The configured ``logging.Logger`` (the root logger)."""
        return self._logger
# Shared, ready-to-use logger instance that other modules import.
logger = Logger().logger

if __name__ == '__main__':
    # Smoke test: emit one record at each severity level.
    logger.debug('调试信息')
    logger.info('状态信息')
    logger.warning('警告信息')
    logger.error('错误信息')
    logger.critical('严重错误信息')
六、代理池校验模块
目的:检查代理IP的响应速度、匿名程度以及支持的协议类型
def check_proxy(proxy):
    """Probe a proxy over HTTP and HTTPS and record the outcome on it.

    Sets three attributes on *proxy* and returns the same object:

    * ``protocal``: 2 when both HTTP and HTTPS work, 0 for HTTP only,
      1 for HTTPS only, -1 when neither works.
    * ``nick_type`` / ``speed``: taken from the HTTP probe when HTTP works,
      otherwise from the HTTPS probe; both are -1 when neither works.
    """
    address = '{}:{}'.format(proxy.ip, proxy.port)
    proxies = {
        'http': 'http://' + address,
        'https': 'https://' + address,
    }

    http_ok, http_anonymity, http_elapsed = _check_http_proxy(proxies)
    https_ok, https_anonymity, https_elapsed = _check_http_proxy(proxies, False)

    if http_ok:
        # HTTP results take precedence whenever the HTTP probe succeeded.
        proxy.protocal = 2 if https_ok else 0
        proxy.nick_type = http_anonymity
        proxy.speed = http_elapsed
    elif https_ok:
        proxy.protocal = 1
        proxy.nick_type = https_anonymity
        proxy.speed = https_elapsed
    else:
        # Neither protocol worked: mark the proxy as unusable.
        proxy.protocal = -1
        proxy.nick_type = -1
        proxy.speed = -1

    logger.debug(proxy)
    return proxy
def _check_http_proxy(proxies, isHttp=True):
    """Request httpbin.org through *proxies* and classify the proxy.

    Args:
        proxies: Proxy mapping handed to ``requests.get`` via ``proxies=``.
        isHttp: When true the ``http://`` test URL is used, otherwise the
            ``https://`` one.

    Returns:
        A tuple ``(ok, nick_type, speed)``. ``ok`` is True only when the
        response was OK and could be parsed. ``nick_type`` is 2 for a
        transparent proxy, 1 for an anonymous proxy, 0 for a high-anonymity
        proxy and -1 when unknown. ``speed`` is the elapsed time in seconds
        rounded to two decimals, or -1 when no OK response was received.
    """
    nick_type = -1  # anonymity level
    speed = -1  # response time
    test_url = 'http://httpbin.org/get' if isHttp else 'https://httpbin.org/get'
    try:
        # time.time must be called; storing the function object made the
        # later subtraction raise TypeError on every probe.
        start = time.time()
        # requests.get takes ``headers`` (plural); ``header`` is rejected
        # as an unexpected keyword argument.
        r = requests.get(url=test_url, headers=get_request_headers(), timeout=TIME_OUT, proxies=proxies)
        if not r.ok:
            return False, nick_type, speed

        speed = round(time.time() - start, 2)
        content = json.loads(r.text)
        headers = content['headers']
        ip = content['origin']
        proxy_connection = headers.get('Proxy-Connection', None)
        if ',' in ip:
            nick_type = 2  # transparent proxy: origin lists more than one address
        elif proxy_connection:
            nick_type = 1  # anonymous proxy: a Proxy-Connection header is present
        else:
            nick_type = 0  # high-anonymity proxy
        return True, nick_type, speed
    except Exception as e:
        # Best-effort probe: any failure (network, timeout, bad payload)
        # just means the proxy is reported as not working.
        logger.exception(e)
        return False, nick_type, speed
七、代理池数据库模块
class MongoPool(object):
    """Hold the MongoDB client and the ``proxies`` collection of ``proxies_pool``."""

    def __init__(self):
        # Connect using the configured URL and select the collection that
        # stores the proxies.
        self.client = MongoClient(MONGO_URL)
        self.proxies = self.client['proxies_pool']['proxies']

    def __del__(self):
        # __del__ also runs when __init__ failed before ``client`` was
        # assigned; guard so that case does not raise AttributeError.
        client = getattr(self, 'client', None)
        if client is not None:
            client.close()
八、代理池爬虫模块
1、定时调度模块
@classmethod
def start(cls):
    """Run the spiders once, then repeat every RUN_SPIDERS_INTERVAL hours.

    This call never returns: after scheduling the job it polls the
    scheduler once per second in an endless loop.
    """
    rs = RunSpider()
    rs.run()
    # ``hours`` (plural) accepts any interval, whereas ``hour`` is only valid
    # for an interval of 1. The job must be ``rs.run``; ``rs.sun`` was a typo.
    schedule.every(RUN_SPIDERS_INTERVAL).hours.do(rs.run)
    while True:
        schedule.run_pending()
        time.sleep(1)
浙公网安备 33010602011771号