# proxy.py — custom proxy middleware
import base64
import random
from urllib.parse import unquote, urlunparse
from urllib.request import _parse_proxy
from scrapy.utils.python import to_bytes
class XdbProxyMiddleware:
    """Scrapy downloader middleware that routes every outgoing request
    through a randomly chosen proxy from :attr:`PROXIES`, attaching a
    ``Proxy-Authorization`` header when the proxy URL embeds credentials.
    """

    # Proxy pool hoisted to a class attribute so the list is not rebuilt
    # on every single request; extend with more proxy URLs as needed.
    PROXIES = [
        'http://125.108.106.165:9000/',
    ]

    def _basic_auth_header(self, username, password):
        """Return the base64-encoded ``username:password`` pair for HTTP
        Basic auth (the value only, without the ``Basic `` prefix).
        """
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)),
            encoding='latin-1')
        return base64.b64encode(user_pass)

    def process_request(self, request, spider):
        """Assign a random proxy to ``request.meta['proxy']``.

        If the chosen proxy URL contains ``user:password@`` credentials,
        also set the ``Proxy-Authorization`` request header.
        """
        url = random.choice(self.PROXIES)
        # _parse_proxy splits "scheme://user:password@host:port" apart.
        proxy_type, user, password, hostport = _parse_proxy(url)
        # Rebuild a bare scheme://host:port URL (no path/credentials).
        proxy_url = urlunparse((proxy_type or '', hostport, '', '', '', ''))
        request.meta['proxy'] = proxy_url
        # NOTE(review): leftover debug output — consider spider.logger
        # instead of printing to stdout.
        print(request.meta.get('proxy'))
        if user:
            # Proxy requires authentication: add the Basic-auth header.
            request.headers['Proxy-Authorization'] = (
                b'Basic ' + self._basic_auth_header(user, password))
Then enable the middleware in the project's configuration file, settings.py:
# Register the custom proxy middleware. Priority 751 was presumably chosen
# to run after Scrapy's built-in proxy middleware — confirm against the
# project's default middleware priorities.
DOWNLOADER_MIDDLEWARES = {
    # 'xdb.middlewares.XdbDownloaderMiddleware': 543,
    'xdb.proxy.XdbProxyMiddleware': 751,
    # 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}