Scrapy notes
Create a new Scrapy project
scrapy startproject [projectName]
Run a spider
From the project root directory:
scrapy crawl [spidername] [-a key=value] [-o output.json]  // -o exports the scraped items as JSON
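Arguments passed with -a are forwarded to the spider's __init__ and become instance attributes. A minimal sketch, assuming a made-up spider name, URL pattern and argument (category):

    import scrapy

    class CategorySpider(scrapy.Spider):
        name = "category_demo"

        def __init__(self, category=None, *args, **kwargs):
            super(CategorySpider, self).__init__(*args, **kwargs)
            # e.g. scrapy crawl category_demo -a category=books
            self.start_urls = ["http://www.example.com/%s" % category]

        def parse(self, response):
            yield {
                "url": response.url,
                "title": response.xpath("//title/text()").extract_first(),
            }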
scrapy shell
From the project root directory:
scrapy shell "url"
>>>
response.body  // the shell also exposes sel, a Selector built from the response
response.xpath()
response.css()
sel.xpath('//title').extract()
Other commands
scrapy fetch --nolog url  // download the page at url and print its content
scrapy view url  // open the response as the spider sees it in a browser
scrapy parse url
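scrapy parse can also target a specific spider and callback, which is handy for testing one parse method against a single page. A usage sketch with placeholder names (myspider, parse_item):

    scrapy parse --spider=myspider -c parse_item "http://www.example.com/some_page.html"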
CrawlSpider
rule
Links in start_urls, and links followed because follow=True, are requested with parse() as the callback (scrapy.Request(url, self.parse)). parse() applies the rules to extract links from the returned response; for each extracted link, the rule's configuration decides whether the response is handed to the callback for data extraction (follow=False) or mined for further links (follow=True).
class scrapy.contrib.spiders.Rule(link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=None)
# link_extractor: a LinkExtractor defining which links to extract.
# callback: called for each response generated from a link extracted by link_extractor; avoid using parse as the callback, since CrawlSpider uses it internally.
# follow: whether links extracted from responses matched by this rule should be followed; defaults to True when callback is None.
# process_links: mainly used to filter the links returned by link_extractor.
# process_request: processing applied to each request generated by this rule before it is sent.
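Putting Rule and a link extractor together, a minimal CrawlSpider sketch (using the newer module paths scrapy.spiders / scrapy.linkextractors, which replaced scrapy.contrib; domain, URL patterns and fields are placeholders):

    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor

    class DemoCrawlSpider(CrawlSpider):
        name = "demo_crawl"
        allowed_domains = ["example.com"]
        start_urls = ["http://www.example.com/"]

        rules = (
            # keep following category pages to discover more links
            Rule(LinkExtractor(allow=(r"/category/",)), follow=True),
            # pages matching /item/<id> are handed to parse_item for data extraction
            Rule(LinkExtractor(allow=(r"/item/\d+",)), callback="parse_item", follow=False),
        )

        def parse_item(self, response):
            yield {
                "url": response.url,
                "title": response.xpath("//title/text()").extract_first(),
            }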
Crawling is depth-first by default.
# breadth-first settings
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
Passing data between parse callbacks
def parse_page1(self, response):
    item = MyItem()
    item['main_url'] = response.url
    request = scrapy.Request("http://www.example.com/some_page.html",
                             callback=self.parse_page2)
    request.meta['item'] = item
    return request

def parse_page2(self, response):
    item = response.meta['item']
    item['other_url'] = response.url
    return item
url:https://scrapy.readthedocs.org/en/latest/topics/request-response.html#passing-additional-data-to-callback-functions
Selector
>>> response.xpath('//a[contains(@href, "image")]/text()').re(r'Name:\s*(.*)')
[u'My image 1', u'My image 2', u'My image 3', u'My image 4', u'My image 5']
url: http://doc.scrapy.org/en/latest/topics/selectors.html
LinkExtractor
class scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), deny_extensions=None, restrict_xpaths=(), tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None)
# allow: URLs matching these regular expressions are extracted; if empty, all URLs match.
# deny: URLs matching this regular expression (or list of regular expressions) are never extracted.
# allow_domains: domains whose links will be extracted.
# deny_domains: domains whose links will never be extracted.
# restrict_xpaths: XPath expressions limiting where links are extracted (combined with allow); if not set, the whole body is searched.
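A small sketch showing allow and restrict_xpaths in use outside a CrawlSpider, via extract_links() on a response (the XPath, URL pattern and spider name are placeholders; LinkExtractor here is the newer scrapy.linkextractors name for the link extractor):

    import scrapy
    from scrapy.linkextractors import LinkExtractor

    class PaginationSpider(scrapy.Spider):
        name = "pagination_demo"
        start_urls = ["http://www.example.com/"]

        # only links inside the pagination block whose href matches /page/<n>
        link_extractor = LinkExtractor(
            allow=(r"/page/\d+",),
            restrict_xpaths=("//div[@class='pagination']",),
        )

        def parse(self, response):
            for link in self.link_extractor.extract_links(response):
                yield scrapy.Request(link.url, callback=self.parse)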
Encoding
''.encode('utf-8')  // encode extracted unicode text as UTF-8 so Chinese characters display correctly
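In Python 2 (the version these notes assume), text returned by .extract() is unicode and must be encoded before being written as bytes; a tiny sketch with a placeholder file name:

    # -*- coding: utf-8 -*-
    # Python 2: .extract() returns unicode; encode it before writing bytes
    title = u'示例标题'   # e.g. what response.xpath('//title/text()').extract_first() returns
    with open('titles.txt', 'a') as f:
        f.write(title.encode('utf-8') + '\n')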
Avoiding bans
- Set download_delay
class MyCrawlSpider(CrawlSpider):
    name = "MyCrawlSpider"
    # download delay in seconds
    download_delay = 2
    allowed_domains = ['....']
- Disable cookies
Set COOKIES_ENABLED = False in settings.py.
- Use a user agent pool
# -*- coding: utf-8 -*-
"""One anti-ban measure: use a pool of user agents.
Note: the corresponding settings must also be added to settings.py.
"""
import random

from scrapy import log
from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware


class RotateUserAgentMiddleware(UserAgentMiddleware):
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        if ua:
            # show the user agent currently in use
            print "********Current UserAgent:%s************" % ua
            # log it
            log.msg('Current UserAgent: ' + ua, level='INFO')
            request.headers.setdefault('User-Agent', ua)

    # the default user_agent_list is composed of Chrome, IE, Firefox, Mozilla, Opera and Netscape strings
    # for more user agent strings, see http://www.useragentstring.com/pages/useragentstring.php
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
Configure settings.py
# disable the default UserAgentMiddleware and enable the rotating one
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    'CSDNBlogCrawlSpider.spiders.rotate_useragent.RotateUserAgentMiddleware': 400,
}
- Use an IP (proxy) pool (see the proxy middleware sketch after this list)
Scrapy+Tor+polipo
http://pkmishra.github.io/blog/2013/03/18/how-to-run-scrapy-with-TOR-and-multiple-browser-agents-part-1-mac/
- Distributed crawling
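A minimal sketch of an IP-pool downloader middleware, in the same spirit as the user agent middleware above; the proxy addresses and class name are placeholders, and the middleware still has to be registered in DOWNLOADER_MIDDLEWARES like the one above:

    import random

    class RotateProxyMiddleware(object):
        # replace with real proxies, e.g. loaded from a file or a proxy service
        proxy_list = [
            "http://127.0.0.1:8118",   # e.g. a local polipo/Tor endpoint
            "http://10.0.0.2:3128",
        ]

        def process_request(self, request, spider):
            # pick a proxy at random for every outgoing request;
            # Scrapy's HttpProxyMiddleware honours request.meta['proxy']
            request.meta['proxy'] = random.choice(self.proxy_list)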
