CrawlSpider crawler examples
CrawlSpider
CrawlSpider is a subclass of Spider; Spider is its parent class.
Purpose: it is designed for full-site crawling, i.e. following every page-number link reachable from a page and scraping the data on each of those pages, as the sketch below illustrates.
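Before the full example, here is a minimal, self-contained sketch of the idea, written against the public Scrapy demo site quotes.toscrape.com rather than the site used below: a single Rule with a LinkExtractor turns every pagination link into an automatic follow-up request.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class PagingDemoSpider(CrawlSpider):
    name = "paging_demo"
    start_urls = ["https://quotes.toscrape.com/"]

    rules = (
        # Every link whose URL matches /page/<n>/ is requested automatically.
        # follow=True feeds pages found this way back through the rules,
        # which is what lets a CrawlSpider reach every page of the site.
        Rule(LinkExtractor(allow=r"/page/\d+/"), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        # Runs for every page matched by the rule above.
        for text in response.css("span.text::text").extract():
            yield {"quote": text}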
Examples
Requirement 1:
Crawl the listings section of the Compass property site (指南针找房网): for every listing on the list pages, scrape its title and the description on its detail page, following the pagination.
Save the scraped titles and descriptions locally in JSON format.
Steps:
Create the Scrapy project: scrapy startproject house
Generate a CrawlSpider: scrapy genspider -t crawl znhouse www.house.com (this produces a skeleton like the one shown below)
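The generated file is only a starting point; depending on the Scrapy version it looks roughly like this, with a placeholder rule and placeholder XPath comments that are meant to be replaced:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ZnhouseSpider(CrawlSpider):
    name = 'znhouse'
    allowed_domains = ['www.house.com']
    start_urls = ['http://www.house.com/']

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        # item['name'] = response.xpath('//div[@id="name"]').get()
        return item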
spiders/znhouse.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# from redis import Redis  # only needed if the commented-out Redis storage in the pipeline is enabled

from house.items import ZnhouseproItem


class ZnhouseSpider(CrawlSpider):
    name = 'znhouse'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.compass.com.kh/cn/buy-listing']

    # Link extractor: pull the pagination links (?page=N) out of every response.
    # follow=True means links extracted by this rule are themselves crawled and
    # run through the rules again, so every page number gets visited.
    rules = (
        Rule(LinkExtractor(allow=r'\?page=\d+$'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        section_list = response.xpath('/html/body/div[5]/div[2]/div[1]/ul[2]/li/div[1]')
        print(len(section_list))  # debug: number of listings found on this page
        for li in section_list:
            title = li.xpath('./a/div[2]/div[1]/div[1]/div[1]/span[@class="line-clamp1"]/text()').extract_first()
            detail_url = li.xpath('./a/@href').extract_first()
            # Build a fresh item per listing; reusing one item object across the
            # loop would let later iterations overwrite earlier titles before
            # the detail callbacks run.
            item = ZnhouseproItem()
            item["title"] = title
            # Hand the partially filled item to the detail-page callback via meta.
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={"item": item})

    def parse_detail(self, response):
        """Parse the listing detail page and pick up the description."""
        desc = response.xpath('/html/body/div[5]/div[3]/div[1]/div[3]/div[5]/pre/text()').extract_first()
        item = response.meta["item"]
        item["desc"] = desc
        yield item
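A note on the meta={"item": item} pattern above: it hands the half-filled item to the detail-page callback so the title and the description end up on the same item. In Scrapy 1.7+ the same thing can be written with cb_kwargs, which delivers the item as an ordinary callback argument; a minimal sketch of the two places that would change:

        # instead of meta={"item": item}:
        yield scrapy.Request(url=detail_url, callback=self.parse_detail, cb_kwargs={"item": item})

    def parse_detail(self, response, item):
        # item arrives as a keyword argument, no response.meta lookup needed
        item["desc"] = response.xpath('/html/body/div[5]/div[3]/div[1]/div[3]/div[5]/pre/text()').extract_first()
        yield item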
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ZnhouseproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    desc = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
import json

from itemadapter import ItemAdapter


class HouseproPipeline:
    def open_spider(self, spider):
        # One file handle for the whole crawl, opened when the spider starts.
        self.fp = open("港湾置业标题和简介.json", "w+", encoding="utf-8")

    def process_item(self, item, spider):
        # Alternative: push the item to Redis instead of a local file
        # conn = spider.conn  # Redis connection object
        # conn.lpush('movieData', item)
        line = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.fp.write(line)
        # Return the original item so any later pipelines still receive an Item.
        return item

    def close_spider(self, spider):
        self.fp.close()
settings.py
# Scrapy settings for house project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'house'

SPIDER_MODULES = ['house.spiders']
NEWSPIDER_MODULE = 'house.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'house (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

LOG_LEVEL = "ERROR"  # only log errors

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'house.middlewares.HouseSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'house.middlewares.HouseDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'house.pipelines.HouseproPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
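With the project laid out this way, running scrapy crawl znhouse from the project directory should page through the listing section and leave the scraped titles and descriptions in 港湾置业标题和简介.json.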
Requirement 2:
Crawl the target site's list pages for the title of each listing plus the description on its detail page, following the pagination.
Save the scraped titles to a local txt file, with deduplication.
Spider file:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from zlsPro.items import ZlsproItem


class ZlsSpider(CrawlSpider):
    name = 'zls'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.compass.com.kh/cn/buy-listing']

    # Link extractor: pull the pagination links (?page=N) out of every response.
    # follow=True means links matched by this rule are crawled and run through
    # the rules again, so every page number is eventually visited.
    rules = (
        Rule(LinkExtractor(allow=r'\?page=\d+$'), callback='parse_item', follow=True),
    )

    li_title = set()  # titles already seen, used for deduplication

    def parse_item(self, response):
        section_list = response.xpath('/html/body/div[5]/div[2]/div[1]/ul[2]/li/div[1]')
        for li in section_list:
            title = li.xpath('./a/div[2]/div[1]/div[1]/div[1]/span[@class="line-clamp1"]/text()').extract_first()
            detail_url = li.xpath('./a/@href').extract_first()
            if not title:
                continue
            title = title.strip()
            # Skip listings whose title has already been collected.
            if title in self.li_title:
                continue
            self.li_title.add(title)
            item = ZlsproItem()
            item["title"] = title
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={"item": item})

    def parse_detail(self, response):
        """Parse the listing detail page and pick up the description."""
        desc = response.xpath('/html/body/div[5]/div[3]/div[1]/div[3]/div[5]/pre/text()').extract_first()
        item = response.meta["item"]
        item["desc"] = desc
        yield item
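Two notes on the deduplication here: Scrapy already drops duplicate requests to the same URL by default, so the li_title set specifically deduplicates on title text, which also filters out listings that reappear on several pages under different URLs. Because the set lives on the spider instance it only covers a single run; if the deduplication has to survive restarts or be shared between processes, a Redis set is the usual next step, which is the direction the commented-out Redis calls in the requirement 1 pipeline point towards.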
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class ZlsproPipeline:
    def open_spider(self, spider):
        self.fp = open("港湾置业标题和简介.txt", "w+", encoding="utf-8")

    def process_item(self, item, spider):
        return item

    def close_spider(self, spider):
        # Write the deduplicated title set collected by the spider once, when
        # the crawl finishes, instead of rewriting it for every item.
        for i in spider.li_title:
            self.fp.write("{0},\n".format(i))
        self.fp.close()
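Assuming the requirement 2 project is named zlsPro (as the from zlsPro.items import in the spider suggests), this pipeline still has to be registered, e.g. ITEM_PIPELINES = {'zlsPro.pipelines.ZlsproPipeline': 300} in settings.py; running scrapy crawl zls then writes the deduplicated titles to 港湾置业标题和简介.txt when the spider closes.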