CrawlSpider crawler examples
CrawlSpider
CrawlSpider is a subclass of Spider; Spider is its parent class.
Purpose: it is designed for full-site crawling, i.e. following every page-number link reachable from a page and scraping the data on each of those pages, as the sketch below illustrates.
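Before the full example, here is a minimal, self-contained sketch of the idea, written against the public Scrapy demo site quotes.toscrape.com rather than the site used below: a single Rule with a LinkExtractor turns every pagination link into an automatic follow-up request.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class PagingDemoSpider(CrawlSpider):
    name = "paging_demo"
    start_urls = ["https://quotes.toscrape.com/"]

    rules = (
        # Every link whose URL matches /page/<n>/ is requested automatically.
        # follow=True feeds pages found this way back through the rules,
        # which is what lets a CrawlSpider reach every page of the site.
        Rule(LinkExtractor(allow=r"/page/\d+/"), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        # Runs for every page matched by the rule above.
        for text in response.css("span.text::text").extract():
            yield {"quote": text}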
Examples
Requirement 1:
Crawl the listings section of the Compass property site (指南针找房网): for every listing on the list pages, scrape its title and the description on its detail page, following the pagination.
Save the scraped titles and descriptions locally in JSON format.
Steps:
Create the Scrapy project: scrapy startproject house
Generate a CrawlSpider: scrapy genspider -t crawl znhouse www.house.com (this produces a skeleton like the one shown below)
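The generated file is only a starting point; depending on the Scrapy version it looks roughly like this, with a placeholder rule and placeholder XPath comments that are meant to be replaced:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ZnhouseSpider(CrawlSpider):
    name = 'znhouse'
    allowed_domains = ['www.house.com']
    start_urls = ['http://www.house.com/']

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        # item['name'] = response.xpath('//div[@id="name"]').get()
        return item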
spiders/znhouse.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# from redis import Redis  # only needed if the commented-out Redis storage in the pipeline is enabled

from house.items import ZnhouseproItem


class ZnhouseSpider(CrawlSpider):
    name = 'znhouse'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.compass.com.kh/cn/buy-listing']

    # Link extractor: pull the pagination links (?page=N) out of every response.
    # follow=True means links extracted by this rule are themselves crawled and
    # run through the rules again, so every page number gets visited.
    rules = (
        Rule(LinkExtractor(allow=r'\?page=\d+$'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        section_list = response.xpath('/html/body/div[5]/div[2]/div[1]/ul[2]/li/div[1]')
        print(len(section_list))  # debug: number of listings found on this page
        for li in section_list:
            title = li.xpath('./a/div[2]/div[1]/div[1]/div[1]/span[@class="line-clamp1"]/text()').extract_first()
            detail_url = li.xpath('./a/@href').extract_first()
            # Build a fresh item per listing; reusing one item object across the
            # loop would let later iterations overwrite earlier titles before
            # the detail callbacks run.
            item = ZnhouseproItem()
            item["title"] = title
            # Hand the partially filled item to the detail-page callback via meta.
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={"item": item})

    def parse_detail(self, response):
        """Parse the listing detail page and pick up the description."""
        desc = response.xpath('/html/body/div[5]/div[3]/div[1]/div[3]/div[5]/pre/text()').extract_first()
        item = response.meta["item"]
        item["desc"] = desc
        yield item
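A note on the meta={"item": item} pattern above: it hands the half-filled item to the detail-page callback so the title and the description end up on the same item. In Scrapy 1.7+ the same thing can be written with cb_kwargs, which delivers the item as an ordinary callback argument; a minimal sketch of the two places that would change:

        # instead of meta={"item": item}:
        yield scrapy.Request(url=detail_url, callback=self.parse_detail, cb_kwargs={"item": item})

    def parse_detail(self, response, item):
        # item arrives as a keyword argument, no response.meta lookup needed
        item["desc"] = response.xpath('/html/body/div[5]/div[3]/div[1]/div[3]/div[5]/pre/text()').extract_first()
        yield item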
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ZnhouseproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    desc = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
import json

from itemadapter import ItemAdapter


class HouseproPipeline:
    def open_spider(self, spider):
        # One file handle for the whole crawl, opened when the spider starts.
        self.fp = open("港湾置业标题和简介.json", "w+", encoding="utf-8")

    def process_item(self, item, spider):
        # Alternative: push the item to Redis instead of a local file
        # conn = spider.conn  # Redis connection object
        # conn.lpush('movieData', item)
        line = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.fp.write(line)
        # Return the original item so any later pipelines still receive an Item.
        return item

    def close_spider(self, spider):
        self.fp.close()
settings.py
# Scrapy settings for house project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'house'

SPIDER_MODULES = ['house.spiders']
NEWSPIDER_MODULE = 'house.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'house (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

LOG_LEVEL = "ERROR"  # only log errors

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'house.middlewares.HouseSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'house.middlewares.HouseDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'house.pipelines.HouseproPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
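With the project laid out this way, running scrapy crawl znhouse from the project directory should page through the listing section and leave the scraped titles and descriptions in 港湾置业标题和简介.json.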
Requirement 2:
Crawl the target site's list pages for the title of each listing plus the description on its detail page, following the pagination.
Save the scraped titles to a local txt file, with deduplication.
Spider file:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from zlsPro.items import ZlsproItem


class ZlsSpider(CrawlSpider):
    name = 'zls'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.compass.com.kh/cn/buy-listing']

    # Link extractor: pull the pagination links (?page=N) out of every response.
    # follow=True means links matched by this rule are crawled and run through
    # the rules again, so every page number is eventually visited.
    rules = (
        Rule(LinkExtractor(allow=r'\?page=\d+$'), callback='parse_item', follow=True),
    )

    li_title = set()  # titles already seen, used for deduplication

    def parse_item(self, response):
        section_list = response.xpath('/html/body/div[5]/div[2]/div[1]/ul[2]/li/div[1]')
        for li in section_list:
            title = li.xpath('./a/div[2]/div[1]/div[1]/div[1]/span[@class="line-clamp1"]/text()').extract_first()
            detail_url = li.xpath('./a/@href').extract_first()
            if not title:
                continue
            title = title.strip()
            # Skip listings whose title has already been collected.
            if title in self.li_title:
                continue
            self.li_title.add(title)
            item = ZlsproItem()
            item["title"] = title
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={"item": item})

    def parse_detail(self, response):
        """Parse the listing detail page and pick up the description."""
        desc = response.xpath('/html/body/div[5]/div[3]/div[1]/div[3]/div[5]/pre/text()').extract_first()
        item = response.meta["item"]
        item["desc"] = desc
        yield item
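Two notes on the deduplication here: Scrapy already drops duplicate requests to the same URL by default, so the li_title set specifically deduplicates on title text, which also filters out listings that reappear on several pages under different URLs. Because the set lives on the spider instance it only covers a single run; if the deduplication has to survive restarts or be shared between processes, a Redis set is the usual next step, which is the direction the commented-out Redis calls in the requirement 1 pipeline point towards.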
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class ZlsproPipeline:
    def open_spider(self, spider):
        self.fp = open("港湾置业标题和简介.txt", "w+", encoding="utf-8")

    def process_item(self, item, spider):
        return item

    def close_spider(self, spider):
        # Write the deduplicated title set collected by the spider once, when
        # the crawl finishes, instead of rewriting it for every item.
        for i in spider.li_title:
            self.fp.write("{0},\n".format(i))
        self.fp.close()
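Assuming the requirement 2 project is named zlsPro (as the from zlsPro.items import in the spider suggests), this pipeline still has to be registered, e.g. ITEM_PIPELINES = {'zlsPro.pipelines.ZlsproPipeline': 300} in settings.py; running scrapy crawl zls then writes the deduplicated titles to 港湾置业标题和简介.txt when the spider closes.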