Using Scrapy downloader middleware
1. Rotating the request IP and User-Agent
user_agent_list = [ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 " "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 " "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 " "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 " "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 " "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 " "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 " "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 " "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 " "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 " "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 " "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 " "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 " "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 " "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 " "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 " "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 " "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 " "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" ] PROXY_http = [ '36.248.129.214:9999', '175.43.33.3:9999', ] PROXY_https = [ '120.83.49.90:9000', '95.189.112.214:35508', ] @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s # 拦截请求 def process_request(self, request, spider): # UA伪装 request.headers['UserAgent'] = random.choice(self.user_agent_list) # 为了验证代理的操作是否生效 request.headers['proxy'] = '175.42.129.20:8771' print(request.headers['proxy']) return None # 拦截所有的响应 def process_response(self, request, response, spider): # Called with the response returned from the downloader. # Must either; # - return a Response object # - return a Request object # - or raise IgnoreRequest return response # 拦截发生异常的请求 def process_exception(self, request, exception, spider): # 代理IP if request.url.split(':')[0] == 'http': request.headers['proxy'] = random.choice(self.PROXY_http) else: request.headers['proxy'] = random.choice(self.PROXY_https) print("更换"+request.headers['proxy']) return request # 将修正之后的请求对象重新的进行请求
2. When the downloaded response is not the one you want, use Selenium to build and return a new one
This goes in the downloader-middleware class in middlewares.py:
# Required at the top of middlewares.py:
from time import sleep

from scrapy.http import HtmlResponse


# This method intercepts the responses for the five section pages and patches them.
def process_response(self, request, response, spider):
    # Pick out the responses that need patching: the url identifies the
    # request, and the request identifies the response.
    if request.url in spider.model_urls:
        # These are the responses for the five sections. Build a new response
        # object that contains the dynamically loaded news data and use it to
        # replace the old one; Selenium makes that dynamic content easy to fetch.
        bro = spider.bro
        bro.get(url=request.url)
        sleep(2)
        # page_source now holds the dynamically loaded news data.
        page_text = bro.page_source
        new_response = HtmlResponse(url=request.url,
                                    body=page_text,
                                    encoding='utf-8',
                                    request=request)
        return new_response
    else:
        # Responses to all other requests pass through unchanged.
        return response
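Note that process_response relies on two attributes of the spider: model_urls, the list of section URLs whose responses should be swapped out, and bro, the shared browser created in the next snippet. The post does not show how model_urls is defined; a minimal placeholder might look like this:

# In the spider class: a parse callback is expected to fill this list
# with the URLs of the five sections (placeholder, not from the post).
model_urls = []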
The spider also needs a small change:
# Required at the top of the spider module:
from selenium import webdriver


# Instantiate one shared browser object when the spider is created.
# (executable_path is Selenium 3 style; Selenium 4 passes a Service instead.)
def __init__(self):
    self.bro = webdriver.Chrome(
        executable_path=r'E:\PycharmProjects\FirstProject\scrapy_project\wangyiPro\wangyiPro\spiders\chromedriver.exe')
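Because __init__ starts a real Chrome instance, the browser should be quit when the crawl ends, or the chromedriver process will linger. A minimal sketch using the spider's closed() hook, which Scrapy calls automatically when the spider closes (this method is an addition, not part of the original post):

# Called once by Scrapy when the spider closes; shut down the shared browser.
def closed(self, reason):
    self.bro.quit()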
