吉阿吉

代理池项目

一、代理池概述

   免费代理不稳定,通常只有约10%可用;便宜的收费代理可用率也只有30%-50%;只有专业的代理IP提供商才能提供高可用的代理(例如智连HTTP)。

 

 

五、代理池工具模块

  1. 日志模块:日志代码可以在不同项目之间复用

       

import sys
import logging


# Logging configuration shared by every module that uses this logger.
LOG_LEVEL = logging.INFO  # minimum severity that is emitted
LOG_FMT = (
    '%(asctime)s %(filename)s [line:%(lineno)d] '
    '%(levelname)s: %(message)s'
)  # layout of a single log record
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'  # timestamp layout used for %(asctime)s
LOG_FILENAME = 'log.log'  # file that the file handler writes to



class Logger(object):
    """Configure the root logger with a file handler and a console handler.

    ``logging.getLogger()`` without a name always returns the same root
    logger, so the handlers are attached only the first time this class is
    instantiated. Attaching them on every construction would make each
    record appear once per ``Logger()`` call.
    """

    # Becomes True once the handlers have been attached to the root logger.
    _configured = False

    def __init__(self):
        self._logger = logging.getLogger()
        self.formatter = logging.Formatter(fmt=LOG_FMT, datefmt=LOG_DATEFMT)
        if not Logger._configured:
            self._logger.addHandler(self._get_file_handler(LOG_FILENAME))
            self._logger.addHandler(self._get_console_handler())
            Logger._configured = True
        self._logger.setLevel(LOG_LEVEL)

    def _get_file_handler(self, filename):
        """Return a UTF-8 file handler for *filename* using the shared formatter."""
        filehandler = logging.FileHandler(filename=filename, encoding='utf-8')
        filehandler.setFormatter(self.formatter)
        return filehandler

    def _get_console_handler(self):
        """Return a handler that writes to ``sys.stdout`` using the shared formatter."""
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setFormatter(self.formatter)
        return console_handler

    @property
    def logger(self):
        """The configured ``logging.Logger`` instance."""
        return self._logger

# Shared logger instance exposed at module level.
logger = Logger().logger

if __name__ == '__main__':
    # Manual smoke test: emit one record at each severity level.
    logger.debug('调试信息')
    logger.info('状态信息')
    logger.warning('警告信息')
    logger.error('错误信息')
    logger.critical('严重错误信息')

 六、代理池校验模块

         目的:检查代理IP的响应速度、匿名程度以及支持的协议类型

   

def check_proxy(proxy):
    """Probe *proxy* over HTTP and HTTPS and record the results on it.

    The proxy object is updated in place and also returned. It receives:

    * ``protocal``: 0 = HTTP only, 1 = HTTPS only, 2 = both, -1 = neither
    * ``nick_type``: anonymity level reported by the successful probe, or -1
    * ``speed``: response time reported by the successful probe, or -1

    When both probes succeed, the HTTP probe's anonymity and speed are used.
    """
    endpoint = '{}:{}'.format(proxy.ip, proxy.port)
    proxies = {
        'http': 'http://' + endpoint,
        'https': 'https://' + endpoint,
    }

    http_ok, http_nick_type, http_speed = _check_http_proxy(proxies)
    https_ok, https_nick_type, https_speed = _check_http_proxy(proxies, False)

    if http_ok:
        # HTTP metrics win whenever the HTTP probe succeeded.
        protocol = 2 if https_ok else 0
        nick_type, speed = http_nick_type, http_speed
    elif https_ok:
        protocol, nick_type, speed = 1, https_nick_type, https_speed
    else:
        protocol = nick_type = speed = -1

    proxy.protocal = protocol
    proxy.nick_type = nick_type
    proxy.speed = speed

    logger.debug(proxy)
    return proxy


def _check_http_proxy(proxies, isHttp=True):
    """Request httpbin.org through *proxies* and classify the proxy.

    Args:
        proxies: Mapping passed unchanged to ``requests.get(proxies=...)``.
        isHttp: Probe ``http://httpbin.org/get`` when true, otherwise
            ``https://httpbin.org/get``.

    Returns:
        A ``(ok, nick_type, speed)`` tuple. ``ok`` is True only when the
        request succeeded and the response could be parsed. ``nick_type`` is
        2 (transparent), 1 (anonymous), 0 (elite) or -1 (unknown). ``speed``
        is the elapsed time in seconds rounded to two decimals, or -1 when it
        was not measured.
    """
    nick_type = -1  # anonymity level
    speed = -1  # response time
    if isHttp:
        test_url = 'http://httpbin.org/get'
    else:
        test_url = 'https://httpbin.org/get'

    try:
        # time.time must be *called*; storing the function itself made the
        # later subtraction raise TypeError.
        start = time.time()
        # The keyword is ``headers``; ``header`` is rejected by requests.
        r = requests.get(
            url=test_url,
            headers=get_request_headers(),
            timeout=TIME_OUT,
            proxies=proxies,
        )
        if not r.ok:
            return False, nick_type, speed

        speed = round(time.time() - start, 2)
        content = json.loads(r.text)
        headers = content['headers']
        ip = content['origin']
        proxy_connection = headers.get('Proxy-Connection', None)

        if ',' in ip:
            nick_type = 2  # transparent proxy: origin lists more than one address
        elif proxy_connection:
            nick_type = 1  # anonymous proxy: Proxy-Connection header is present
        else:
            nick_type = 0  # elite (high-anonymity) proxy
        return True, nick_type, speed

    except Exception as e:
        # Any network or parsing failure means the proxy is unusable for
        # this protocol; log it and report failure.
        logger.exception(e)
        return False, nick_type, speed

 

七、代理池数据库模块  

     

class MongoPool(object):
    """Hold a MongoDB client and the ``proxies_pool.proxies`` collection."""

    def __init__(self):
        self.client = MongoClient(MONGO_URL)
        self.proxies = self.client['proxies_pool']['proxies']

    def __del__(self):
        """Close the client connection when the pool is garbage-collected."""
        # __del__ also runs when __init__ raised before ``client`` was set,
        # so look the attribute up defensively.
        client = getattr(self, 'client', None)
        if client is not None:
            client.close()

 

 

八、代理池爬虫模块

   1、定时调度模块

@classmethod
def start(cls):
    """Run the spiders once, then re-run them on a fixed schedule forever.

    The spiders run immediately, and again every ``RUN_SPIDERS_INTERVAL``
    hours. This call never returns; it polls the scheduler once per second.
    """
    rs = RunSpider()
    rs.run()
    # ``hours`` accepts any interval, whereas ``hour`` is only valid for an
    # interval of 1. The scheduled job must be ``rs.run`` (was ``rs.sun``).
    schedule.every(RUN_SPIDERS_INTERVAL).hours.do(rs.run)
    while True:
        schedule.run_pending()
        time.sleep(1)

posted on 2021-04-20 21:10  吉阿吉  阅读(104)  评论(0)    收藏  举报

导航