Scrapy: downloading images with the built-in image pipeline
ImagesPipeline is a class that ships with Scrapy for handling images, i.e. downloading them to local disk while crawling.
Advantages (most of these are configured through Scrapy settings; see the sketch after this list):
- Converts downloaded images to the common JPG format and RGB mode
- Avoids downloading the same image twice
- Thumbnail generation
- Filtering out images that are too small
- Asynchronous downloading
- and more
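A minimal sketch of the settings that drive these features (the values below are illustrative placeholders, not the ones used in the project later in this post):

```python
# settings.py -- illustrative values only
IMAGES_STORE = 'images'   # root directory for downloaded images
IMAGES_EXPIRES = 90       # skip re-downloading images fetched within the last 90 days
IMAGES_THUMBS = {         # generate thumbnails alongside the full-size image
    'small': (50, 50),
    'big': (270, 270),
}
IMAGES_MIN_HEIGHT = 110   # filter out images smaller than 110x110 pixels
IMAGES_MIN_WIDTH = 110
```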
Workflow:
- Crawl an item and put the image URLs into its image_urls field
- The item returned by the Spider is passed on to the item pipeline
- When the item reaches the ImagesPipeline, Scrapy's scheduler and downloader are used to schedule and download every URL in image_urls
- After the images have been downloaded successfully, information such as the download path, the original URL, and the checksum is filled into the images field (illustrated right after this list)
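As an illustration of that last step, here is a hypothetical pipeline (not part of the project below) that runs after ImagesPipeline and simply logs what was written into the images field; each entry is a dict that includes at least the url, the storage path, and the checksum:

```python
class LogImagesPipeline:
    """Hypothetical helper: give it a higher ITEM_PIPELINES priority number
    than ImagesPipeline so it runs after the downloads have finished."""

    def process_item(self, item, spider):
        for img in item.get('images', []):
            spider.logger.info('downloaded %s -> %s (checksum %s)',
                               img['url'], img['path'], img['checksum'])
        return item
```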
Implementation options:
- Custom pipeline: the advantage is that you can override the methods of ImagesPipeline and, for example, sort the images into categories as needed;
- Use ImagesPipeline directly: simple but not flexible; every image is saved under the full folder and cannot be categorized.
Code examples
1. Version without categorization
```python
# -*- coding: utf-8 -*-
import scrapy
from bmw.items import BmwItem


class BmwHcSpider(scrapy.Spider):
    name = 'bmw_hc'
    # allowed_domains = ['https://car.autohome.com.cn/pic/series/65.html']
    start_urls = ['https://car.autohome.com.cn/pic/series/65.html']

    def parse(self, response):
        uiboxs = response.xpath('//div[@class="column grid-16"]//div[@class="uibox"]')[1:]
        for uibox in uiboxs:
            category = uibox.xpath("./div[@class='uibox-title']/a/text()").get()
            urls = uibox.xpath('.//ul/li/a/img/@src').getall()
            urls = list(map(lambda x: response.urljoin(x), urls))
            item = BmwItem(category=category, image_urls=urls)
            yield item
```
```python
# items.py
import scrapy


class BmwItem(scrapy.Item):
    # define the fields for your item here like:
    category = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
```
```python
# settings.py
import os

ITEM_PIPELINES = {
    # 'bmw.pipelines.BmwPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1
}
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
```

The key points here are enabling the default ImagesPipeline and configuring the storage path (IMAGES_STORE).
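With the spider, the item, and these settings in place, the crawl can be started from the project root with `scrapy crawl bmw_hc`. Note that ImagesPipeline depends on Pillow for its image processing (the JPG/RGB conversion mentioned above), so install it first with `pip install Pillow`.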
2. Categorized version
This builds on the previous version with the following changes.
```python
# pipelines.py
import os

from scrapy.pipelines.images import ImagesPipeline
from bmw.settings import IMAGES_STORE


class BmwPipeline(ImagesPipeline):  # inherit Scrapy's built-in image download pipeline
    def get_media_requests(self, item, info):
        # Override the parent method (called before the image requests are sent)
        requests_objs = super().get_media_requests(item, info)
        for request_obj in requests_objs:
            request_obj.item = item  # attach the item to each request object
        return requests_objs

    def file_path(self, request, response=None, info=None):
        path = super().file_path(request, response, info)      # default path, e.g. 'full/<hash>.jpg'
        category = request.item.get('category')                # category name
        images_store = IMAGES_STORE                            # storage root
        category_path = os.path.join(images_store, category)   # per-category directory
        if not os.path.exists(category_path):                  # create it if it does not exist yet
            os.mkdir(category_path)
        image_name = path.replace('full/', '')                 # strip the default full/ prefix
        image_path = os.path.join(category_path, image_name)
        return image_path
```
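Besides get_media_requests and file_path, ImagesPipeline also provides an item_completed hook that receives the per-URL download results. As a sketch of what it could be used for (the class name is made up; the method could equally be added to BmwPipeline), items whose downloads all failed can be dropped:

```python
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class DropEmptyImagesPipeline(ImagesPipeline):
    """Hypothetical variant: discard items for which every image download failed."""

    def item_completed(self, results, item, info):
        # `results` is a list of (success, info_or_failure) tuples,
        # one per URL taken from image_urls.
        if not any(ok for ok, _ in results):
            raise DropItem('no images were downloaded for %r' % item.get('category'))
        return item
```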
```python
# settings.py
ITEM_PIPELINES = {
    'bmw.pipelines.BmwPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1
}
```
3. Downloading all the images
The spider needs to be changed to a CrawlSpider.
```python
# settings.py
import os

ITEM_PIPELINES = {
    'bmw.pipelines.BmwPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1
}
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
```
```python
# items.py
import scrapy


class BmwItem(scrapy.Item):
    # define the fields for your item here like:
    category = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
```
```python
import scrapy
from bmw.items import BmwItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class BmwHcSpider(CrawlSpider):
    name = 'bmw_hc'
    # allowed_domains = ['xxx']
    start_urls = ['https://car.autohome.com.cn/pic/series/65.html']
    link = LinkExtractor(allow=r'https://car.autohome.com.cn/pic/series/65.+')
    rules = (
        Rule(link, callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        category = response.xpath("//div[@class='uibox']/div/text()").get()
        print(category)
        srcs = response.xpath('//div[contains(@class,"uibox-con")]/ul/li//img/@src').getall()
        # removing the 't_' part of the src turns the thumbnail URL into the full-size image URL
        srcs = list(map(lambda x: response.urljoin(x.replace('t_', '')), srcs))
        yield BmwItem(category=category, image_urls=srcs)
```
```python
# pipelines.py (unchanged from the categorized version)
import os

from scrapy.pipelines.images import ImagesPipeline
from bmw.settings import IMAGES_STORE


class BmwPipeline(ImagesPipeline):  # inherit Scrapy's built-in image download pipeline
    def get_media_requests(self, item, info):
        # Override the parent method (called before the image requests are sent)
        requests_objs = super().get_media_requests(item, info)
        for request_obj in requests_objs:
            request_obj.item = item  # attach the item to each request object
        return requests_objs

    def file_path(self, request, response=None, info=None):
        path = super().file_path(request, response, info)      # default path, e.g. 'full/<hash>.jpg'
        category = request.item.get('category')                # category name
        images_store = IMAGES_STORE                            # storage root
        category_path = os.path.join(images_store, category)   # per-category directory
        if not os.path.exists(category_path):                  # create it if it does not exist yet
            os.mkdir(category_path)
        image_name = path.replace('full/', '')                 # strip the default full/ prefix
        image_path = os.path.join(category_path, image_name)
        return image_path
```
