Scrapy: downloading images with the built-in image pipeline

ImagesPipeline is a class that ships with Scrapy for handling images, i.e. downloading them to local disk while crawling.

Advantages:

  1. Converts downloaded images to a common format (JPG) and mode (RGB)
  2. Avoids re-downloading images that were fetched recently
  3. Thumbnail generation
  4. Filtering out images by size; both this and thumbnails are configured in settings (see the sketch after this list)
  5. Asynchronous downloading
  6. ......
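
Thumbnail generation, size filtering, and re-download avoidance are all controlled from settings. A minimal sketch (the values below are illustrative, not required):

# settings.py -- optional ImagesPipeline knobs (values are examples)
IMAGES_EXPIRES = 90         # don't re-download images fetched within the last 90 days
IMAGES_THUMBS = {           # also generate thumbnails for every image
    'small': (50, 50),
    'big': (270, 270),
}
IMAGES_MIN_HEIGHT = 110     # silently drop images smaller than 110x110
IMAGES_MIN_WIDTH = 110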

Workflow:

  1. The spider scrapes an item and puts the image URLs into its image_urls field
  2. The item is returned from the spider and travels through the item pipeline
  3. When the item reaches the ImagesPipeline, the URLs in image_urls are handed to the Scrapy scheduler and downloader for scheduling and downloading
  4. Once the images finish downloading, the images field is populated with information such as the download path, the source URL, and a checksum (a sample follows this list)
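
After a successful download, each entry in images is a dict. A representative (hypothetical) value:

# What item['images'] might look like once the pipeline has run
[{'url': 'https://car.autohome.com.cn/pic/example.jpg',         # original URL (illustrative)
  'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg',  # path relative to IMAGES_STORE
  'checksum': '2b00042f7481c7b056c4b410d28f33cf'}]              # MD5 of the image content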

Implementation options:

  1. Subclass ImagesPipeline in a custom pipeline. The advantage is that you can override its methods and, for example, sort the images into per-category folders.
  2. Use the ImagesPipeline class directly. Simple but inflexible: every image is saved flat under the full folder with a hash-based name, so no categorization is possible (see the naming sketch below).
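
By default the stock pipeline names each saved file after the SHA1 hash of its URL, which is why everything ends up flat inside full/. Roughly (a sketch of the naming scheme, with an illustrative URL):

# Roughly how the stock ImagesPipeline derives its save path
import hashlib

url = 'https://car.autohome.com.cn/pic/example.jpg'  # illustrative URL
print('full/%s.jpg' % hashlib.sha1(url.encode()).hexdigest())
# -> full/<40-character hex digest>.jpg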

 

Code examples

1. Basic version (no categorization)

Spider (spiders/bmw_hc.py):

# -*- coding: utf-8 -*-
import scrapy
from bmw.items import BmwItem


class BmwHcSpider(scrapy.Spider):
    name = 'bmw_hc'
    # allowed_domains = ['https://car.autohome.com.cn/pic/series/65.html']
    start_urls = ['https://car.autohome.com.cn/pic/series/65.html']

    def parse(self, response):
        # Each "uibox" block is one image category; the first block is skipped
        uiboxs = response.xpath('//div[@class="column grid-16"]//div[@class="uibox"]')[1:]

        for uibox in uiboxs:
            category = uibox.xpath("./div[@class='uibox-title']/a/text()").get()
            urls = uibox.xpath('.//ul/li/a/img/@src').getall()
            # src values are relative; turn them into absolute URLs
            urls = list(map(lambda x: response.urljoin(x), urls))

            item = BmwItem(category=category, image_urls=urls)
            yield item

Item (items.py):

import scrapy


class BmwItem(scrapy.Item):
    # define the fields for your item here like:
    category = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
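
image_urls and images are only the pipeline's default field names. If an item already uses different names, they can be remapped in settings; a sketch with hypothetical field names:

# settings.py -- remap the pipeline's input/output fields (field names are hypothetical)
IMAGES_URLS_FIELD = 'pic_urls'
IMAGES_RESULT_FIELD = 'pics'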

Settings (settings.py). Enable the stock pipeline and set the storage root:

import os

ITEM_PIPELINES = {
    # 'bmw.pipelines.BmwPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1
}
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')


The whole point is enabling the default pipeline and setting the path; with these two lines in place, running scrapy crawl bmw_hc saves every image under images/full/.

2. Categorized version

Modified from the previous version; only the pipeline and one line in settings change.

Custom pipeline (pipelines.py):

from scrapy.pipelines.images import ImagesPipeline
from bmw.settings import IMAGES_STORE
import os


class BmwPipeline(ImagesPipeline):  # inherit Scrapy's built-in image pipeline

    def get_media_requests(self, item, info):  # called just before the image requests are sent
        request_objs = super().get_media_requests(item, info)
        for request_obj in request_objs:
            request_obj.item = item  # attach the item to each request so file_path can see it
        return request_objs

    def file_path(self, request, response=None, info=None):
        path = super().file_path(request, response, info)  # default path, e.g. 'full/<sha1>.jpg'

        category = request.item.get('category')  # category scraped by the spider
        category_path = os.path.join(IMAGES_STORE, category)  # per-category folder

        if not os.path.exists(category_path):  # create the category folder on first use
            os.makedirs(category_path)

        image_name = path.replace('full/', '')  # drop the default 'full/' prefix
        return os.path.join(category_path, image_name)  # absolute path overrides the store root
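
Attaching the item to each request is a workaround needed on older Scrapy versions. From Scrapy 2.4 on, file_path receives the item directly, so the override can be simplified; a sketch for newer versions:

# Scrapy >= 2.4: file_path receives the item, so no request.item hack is needed
import os
from scrapy.pipelines.images import ImagesPipeline


class BmwPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        path = super().file_path(request, response, info, item=item)
        # return a path relative to IMAGES_STORE; folders are created automatically
        return os.path.join(item['category'], os.path.basename(path))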

Enable the custom pipeline in settings.py (the stock pipeline is no longer needed):

ITEM_PIPELINES = {
    'bmw.pipelines.BmwPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1
}

3. Downloading all the images (CrawlSpider version)

To grab every page of pictures instead of just the first, the spider is rewritten as a CrawlSpider.

Settings (settings.py):

import os

ITEM_PIPELINES = {
    'bmw.pipelines.BmwPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1
}
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')

Item (items.py, unchanged):

import scrapy


class BmwItem(scrapy.Item):
    # define the fields for your item here like:
    category = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()

Spider (spiders/bmw_hc.py):

import scrapy
from bmw.items import BmwItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class BmwHcSpider(CrawlSpider):
    name = 'bmw_hc'
    # allowed_domains = ['xxx']
    start_urls = ['https://car.autohome.com.cn/pic/series/65.html']

    # Follow every picture page under this series and parse each one
    link = LinkExtractor(allow=r'https://car.autohome.com.cn/pic/series/65.+')
    rules = (
        Rule(link, callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        category = response.xpath("//div[@class='uibox']/div/text()").get()
        srcs = response.xpath('//div[contains(@class,"uibox-con")]/ul/li//img/@src').getall()
        # Drop the 't_' thumbnail marker so the full-size image is requested
        srcs = list(map(lambda x: response.urljoin(x.replace('t_', '')), srcs))

        yield BmwItem(category=category, image_urls=srcs)

Pipeline (pipelines.py, identical to the one in section 2):

from scrapy.pipelines.images import ImagesPipeline
from bmw.settings import IMAGES_STORE
import os


class BmwPipeline(ImagesPipeline):  # inherit Scrapy's built-in image pipeline

    def get_media_requests(self, item, info):  # called just before the image requests are sent
        request_objs = super().get_media_requests(item, info)
        for request_obj in request_objs:
            request_obj.item = item  # attach the item to each request so file_path can see it
        return request_objs

    def file_path(self, request, response=None, info=None):
        path = super().file_path(request, response, info)  # default path, e.g. 'full/<sha1>.jpg'

        category = request.item.get('category')  # category scraped by the spider
        category_path = os.path.join(IMAGES_STORE, category)  # per-category folder

        if not os.path.exists(category_path):  # create the category folder on first use
            os.makedirs(category_path)

        image_name = path.replace('full/', '')  # drop the default 'full/' prefix
        return os.path.join(category_path, image_name)  # absolute path overrides the store root
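
If some downloads fail, the item still passes through with whatever succeeded. The documented way to discard items that got no images at all is to also override item_completed; a sketch:

# Optional addition to BmwPipeline: drop items whose downloads all failed
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class BmwPipeline(ImagesPipeline):
    # ... get_media_requests and file_path as above ...

    def item_completed(self, results, item, info):
        # results is a list of (success, result_dict_or_failure) tuples
        image_paths = [res['path'] for ok, res in results if ok]
        if not image_paths:
            raise DropItem('Item contains no images')
        return item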

 
