Scrapy selectors and how to use proxies

Scrapy selectors
html = """<!DOCTYPE html> <html> <head lang="en"> <meta charset="UTF-8"> <title></title> </head> <body> <ul> <li class="item-"><a id='i1' href="link.html">first item</a></li> <li class="item-0"><a id='i2' href="llink.html">first item</a></li> <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li> </ul> <div><a href="llink2.html">second item</a></div> </body> </html> """ from scrapy.http import HtmlResponse response = HtmlResponse(url='http://example.com', body=html,encoding='utf-8') # hxs = Selector(response) # hxs.xpath() response.xpath('')
Using CookieJar
import scrapy
from scrapy.http.response.html import HtmlResponse
from scrapy.http import Request
from scrapy.http.cookies import CookieJar


class ChoutiSpider(scrapy.Spider):
    name = "chouti"
    allowed_domains = ["chouti.com"]
    start_urls = (
        'http://www.chouti.com/',
    )

    def start_requests(self):
        url = 'http://dig.chouti.com/'
        yield Request(url=url, callback=self.login, meta={'cookiejar': True})

    def login(self, response):
        # print(response.headers.getlist('Set-Cookie'))
        req = Request(
            url='http://dig.chouti.com/login',
            method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            body='phone=8613121758648&password=woshiniba&oneMonth=1',
            callback=self.check_login,
            meta={'cookiejar': True}
        )
        yield req

    def check_login(self, response):
        print(response.text)
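The CookieJar import is not actually exercised above, because meta={'cookiejar': True} lets Scrapy's CookiesMiddleware carry the cookies automatically. A rough sketch of managing them by hand instead (note that _cookies is an internal attribute of CookieJar, and this variant of login() is only an illustration):

    def login(self, response):
        # Pull the cookies set by the first response into a plain dict.
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)
        cookie_dict = {}
        for domain, paths in cookie_jar._cookies.items():
            for path, names in paths.items():
                for name, cookie in names.items():
                    cookie_dict[name] = cookie.value

        # Attach them explicitly to the login request.
        yield Request(
            url='http://dig.chouti.com/login',
            method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            body='phone=8613121758648&password=woshiniba&oneMonth=1',
            cookies=cookie_dict,
            callback=self.check_login,
        )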
How to use proxies
Custom (downloader middleware)

# by luffycity.com
import base64
import random
from six.moves.urllib.parse import unquote

try:
    from urllib2 import _parse_proxy
except ImportError:
    from urllib.request import _parse_proxy
from six.moves.urllib.parse import urlunparse
from scrapy.utils.python import to_bytes


# Approach 1: parse a full proxy URL (with credentials) and build the
# Proxy-Authorization header manually.
class XdbProxyMiddleware(object):

    def _basic_auth_header(self, username, password):
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)), encoding='latin-1')
        return base64.b64encode(user_pass).strip()

    def process_request(self, request, spider):
        PROXIES = [
            "http://root:woshiniba@192.168.11.11:9999/",
            "http://root:woshiniba@192.168.11.12:9999/",
            "http://root:woshiniba@192.168.11.13:9999/",
            "http://root:woshiniba@192.168.11.14:9999/",
            "http://root:woshiniba@192.168.11.15:9999/",
            "http://root:woshiniba@192.168.11.16:9999/",
        ]
        url = random.choice(PROXIES)

        orig_type = ""
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))

        if user:
            creds = self._basic_auth_header(user, password)
        else:
            creds = None
        request.meta['proxy'] = proxy_url
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds


# Approach 2: keep ip:port and the credentials separately.
class DdbProxyMiddleware(object):

    def process_request(self, request, spider):
        PROXIES = [
            {'ip_port': '111.11.228.75:80', 'user_pass': ''},
            {'ip_port': '120.198.243.22:80', 'user_pass': ''},
            {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
            {'ip_port': '101.71.27.120:80', 'user_pass': ''},
            {'ip_port': '122.96.59.104:80', 'user_pass': ''},
            {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
        ]
        proxy = random.choice(PROXIES)
        if proxy['user_pass']:
            request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
            encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
            request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass
        else:
            request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
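Either middleware only runs once it is registered in settings.py. A minimal sketch, assuming the classes above live in xdb/proxy.py (the module path and priority value are assumptions, not part of the original notes):

    # settings.py
    DOWNLOADER_MIDDLEWARES = {
        # Register the custom proxy middleware; disabling the built-in
        # HttpProxyMiddleware keeps it from also touching request.meta['proxy'].
        'xdb.proxy.XdbProxyMiddleware': 751,
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    }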
Approach 3: built-in
                When the spider starts, simply set the proxy in os.environ ahead of time.
                    class ChoutiSpider(scrapy.Spider):
                        name = 'chouti'
                        allowed_domains = ['chouti.com']
                        start_urls = ['https://dig.chouti.com/']
                        cookie_dict = {}

                        def start_requests(self):
                            import os
                            os.environ['HTTPS_PROXY'] = "http://root:woshiniba@192.168.11.11:9999/"
                            os.environ['HTTP_PROXY'] = 'http://19.11.2.32'
                            for url in self.start_urls:
                                yield Request(url=url,callback=self.parse)
Via meta (set the proxy per request):
                    class ChoutiSpider(scrapy.Spider):
                        name = 'chouti'
                        allowed_domains = ['chouti.com']
                        start_urls = ['https://dig.chouti.com/']
                        cookie_dict = {}

                        def start_requests(self):
                            for url in self.start_urls:
                                    yield Request(url=url, callback=self.parse, meta={'proxy': 'http://root:woshiniba@192.168.11.11:9999/'})
settings configuration
Limit crawl depth
DEPTH_LIMIT = 3
# Change the default duplicate-request filter
# DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
# DUPEFILTER_CLASS = 'xdb.dupefilters.XdbDupeFilter'
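The second commented line points at a custom filter. A minimal sketch of what such a class could look like, assuming a module xdb/dupefilters.py (the class body here is illustrative, not the original author's code):

    # xdb/dupefilters.py
    from scrapy.dupefilters import BaseDupeFilter
    from scrapy.utils.request import request_fingerprint


    class XdbDupeFilter(BaseDupeFilter):

        def __init__(self):
            # Fingerprints of every request seen so far.
            self.visited_fp = set()

        def request_seen(self, request):
            # Return True to drop the request as a duplicate.
            fp = request_fingerprint(request)
            if fp in self.visited_fp:
                return True
            self.visited_fp.add(fp)
            return False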


 
