Scrapy: downloading images with the built-in image pipeline
ImagesPipeline is a class that ships with Scrapy for handling images, i.e. downloading them to local disk while crawling.
Advantages (most of these are configured through Scrapy settings; see the sketch after this list):
- Converts downloaded images to the common JPG format and RGB mode
- Avoids downloading the same image twice
- Thumbnail generation
- Filtering out images that are too small
- Asynchronous downloading
- and more
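A minimal sketch of the settings that drive these features (the values below are illustrative placeholders, not the ones used in the project later in this post):

```python
# settings.py -- illustrative values only
IMAGES_STORE = 'images'   # root directory for downloaded images
IMAGES_EXPIRES = 90       # skip re-downloading images fetched within the last 90 days
IMAGES_THUMBS = {         # generate thumbnails alongside the full-size image
    'small': (50, 50),
    'big': (270, 270),
}
IMAGES_MIN_HEIGHT = 110   # filter out images smaller than 110x110 pixels
IMAGES_MIN_WIDTH = 110
```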
Workflow:
- Crawl an item and put the image URLs into its image_urls field
- The item returned by the Spider is passed on to the item pipeline
- When the item reaches the ImagesPipeline, Scrapy's scheduler and downloader are used to schedule and download every URL in image_urls
- After the images have been downloaded successfully, information such as the download path, the original URL, and the checksum is filled into the images field (illustrated right after this list)
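As an illustration of that last step, here is a hypothetical pipeline (not part of the project below) that runs after ImagesPipeline and simply logs what was written into the images field; each entry is a dict that includes at least the url, the storage path, and the checksum:

```python
class LogImagesPipeline:
    """Hypothetical helper: give it a higher ITEM_PIPELINES priority number
    than ImagesPipeline so it runs after the downloads have finished."""

    def process_item(self, item, spider):
        for img in item.get('images', []):
            spider.logger.info('downloaded %s -> %s (checksum %s)',
                               img['url'], img['path'], img['checksum'])
        return item
```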
Implementation options:
- Custom pipeline: the advantage is that you can override the methods of ImagesPipeline and, for example, sort the images into categories as needed;
- Use ImagesPipeline directly: simple but not flexible; every image is saved under the full folder and cannot be categorized.
Code examples
1. Version without categorization
```python
# -*- coding: utf-8 -*-
import scrapy
from bmw.items import BmwItem


class BmwHcSpider(scrapy.Spider):
    name = 'bmw_hc'
    # allowed_domains = ['https://car.autohome.com.cn/pic/series/65.html']
    start_urls = ['https://car.autohome.com.cn/pic/series/65.html']

    def parse(self, response):
        uiboxs = response.xpath('//div[@class="column grid-16"]//div[@class="uibox"]')[1:]
        for uibox in uiboxs:
            category = uibox.xpath("./div[@class='uibox-title']/a/text()").get()
            urls = uibox.xpath('.//ul/li/a/img/@src').getall()
            urls = list(map(lambda x: response.urljoin(x), urls))
            item = BmwItem(category=category, image_urls=urls)
            yield item
```
```python
# items.py
import scrapy


class BmwItem(scrapy.Item):
    # define the fields for your item here like:
    category = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
```
```python
# settings.py
import os

ITEM_PIPELINES = {
    # 'bmw.pipelines.BmwPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1
}
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
```

The key points here are enabling the default ImagesPipeline and configuring the storage path (IMAGES_STORE).
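With the spider, the item, and these settings in place, the crawl can be started from the project root with `scrapy crawl bmw_hc`. Note that ImagesPipeline depends on Pillow for its image processing (the JPG/RGB conversion mentioned above), so install it first with `pip install Pillow`.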
2. Categorized version
This builds on the previous version with the following changes.
```python
# pipelines.py
import os

from scrapy.pipelines.images import ImagesPipeline
from bmw.settings import IMAGES_STORE


class BmwPipeline(ImagesPipeline):  # inherit Scrapy's built-in image download pipeline
    def get_media_requests(self, item, info):
        # Override the parent method (called before the image requests are sent)
        requests_objs = super().get_media_requests(item, info)
        for request_obj in requests_objs:
            request_obj.item = item  # attach the item to each request object
        return requests_objs

    def file_path(self, request, response=None, info=None):
        path = super().file_path(request, response, info)      # default path, e.g. 'full/<hash>.jpg'
        category = request.item.get('category')                # category name
        images_store = IMAGES_STORE                            # storage root
        category_path = os.path.join(images_store, category)   # per-category directory
        if not os.path.exists(category_path):                  # create it if it does not exist yet
            os.mkdir(category_path)
        image_name = path.replace('full/', '')                 # strip the default full/ prefix
        image_path = os.path.join(category_path, image_name)
        return image_path
```
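Besides get_media_requests and file_path, ImagesPipeline also provides an item_completed hook that receives the per-URL download results. As a sketch of what it could be used for (the class name is made up; the method could equally be added to BmwPipeline), items whose downloads all failed can be dropped:

```python
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class DropEmptyImagesPipeline(ImagesPipeline):
    """Hypothetical variant: discard items for which every image download failed."""

    def item_completed(self, results, item, info):
        # `results` is a list of (success, info_or_failure) tuples,
        # one per URL taken from image_urls.
        if not any(ok for ok, _ in results):
            raise DropItem('no images were downloaded for %r' % item.get('category'))
        return item
```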
```python
# settings.py
ITEM_PIPELINES = {
    'bmw.pipelines.BmwPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1
}
```
3. Downloading all the images
The spider needs to be changed to a CrawlSpider.
```python
# settings.py
import os

ITEM_PIPELINES = {
    'bmw.pipelines.BmwPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1
}
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
```
```python
# items.py
import scrapy


class BmwItem(scrapy.Item):
    # define the fields for your item here like:
    category = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
```
```python
import scrapy
from bmw.items import BmwItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class BmwHcSpider(CrawlSpider):
    name = 'bmw_hc'
    # allowed_domains = ['xxx']
    start_urls = ['https://car.autohome.com.cn/pic/series/65.html']
    link = LinkExtractor(allow=r'https://car.autohome.com.cn/pic/series/65.+')
    rules = (
        Rule(link, callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        category = response.xpath("//div[@class='uibox']/div/text()").get()
        print(category)
        srcs = response.xpath('//div[contains(@class,"uibox-con")]/ul/li//img/@src').getall()
        # removing the 't_' part of the src turns the thumbnail URL into the full-size image URL
        srcs = list(map(lambda x: response.urljoin(x.replace('t_', '')), srcs))
        yield BmwItem(category=category, image_urls=srcs)
```
```python
# pipelines.py (unchanged from the categorized version)
import os

from scrapy.pipelines.images import ImagesPipeline
from bmw.settings import IMAGES_STORE


class BmwPipeline(ImagesPipeline):  # inherit Scrapy's built-in image download pipeline
    def get_media_requests(self, item, info):
        # Override the parent method (called before the image requests are sent)
        requests_objs = super().get_media_requests(item, info)
        for request_obj in requests_objs:
            request_obj.item = item  # attach the item to each request object
        return requests_objs

    def file_path(self, request, response=None, info=None):
        path = super().file_path(request, response, info)      # default path, e.g. 'full/<hash>.jpg'
        category = request.item.get('category')                # category name
        images_store = IMAGES_STORE                            # storage root
        category_path = os.path.join(images_store, category)   # per-category directory
        if not os.path.exists(category_path):                  # create it if it does not exist yet
            os.mkdir(category_path)
        image_name = path.replace('full/', '')                 # strip the default full/ prefix
        image_path = os.path.join(category_path, image_name)
        return image_path
```
