scrapy 图片下载设置

scrapy 设置图片下载

1 settings.py 配置

# Item pipelines enabled for the project, mapped to their run order.
ITEM_PIPELINES = {
    # The project's own pipeline, left disabled in this example:
    # 'img_spider.pipelines.ImgSpiderPipeline': 300,
    # Built-in pipeline that downloads the images referenced by each item.
    'scrapy.pipelines.images.ImagesPipeline': 300,
}

# Directory where downloaded images are stored
# (original note: create this directory beforehand).
IMAGES_STORE = "images"

# Thumbnail variants to generate when storing an image: name -> size tuple.
IMAGES_THUMBS = {
    "small": (50, 50),
    "big": (1000, 1000),
}

2 360图片爬虫应用

import scrapy
import json
class SotuSpider(scrapy.Spider):
    """Spider that pages through the image.so.com JSON feed.

    Each response is parsed as JSON; the ``qhimg_url`` of every entry in
    its ``list`` array is collected and yielded as an item with an
    ``image_urls`` key, which is the field the images pipeline consumes.
    Paging continues while the response reports a positive ``count`` and
    ``page_sn`` is still below ``MAX_COUNT``.
    """

    name = 'sotu'
    allowed_domains = ['so.com']
    # URL template; ``{}`` is filled with the current ``page_sn`` offset.
    start_urls = ['https://image.so.com/zjl?ch=beauty&sn={}&listtype=new&temp=1']
    # Starting offset sent as the ``sn`` query parameter. It is advanced on
    # the class itself, so the value is shared by all instances.
    page_sn = 1
    # Paging stops once ``page_sn`` reaches this value.
    MAX_COUNT = 200

    def start_requests(self):
        """Yield the first request for each URL template in ``start_urls``."""
        for url in self.start_urls:
            yield scrapy.Request(url=url.format(self.page_sn), callback=self.parse, encoding="UTF-8")

    def parse(self, response, **kwargs):
        """Extract image URLs from one JSON page and schedule the next page.

        Yields:
            A dict item ``{"image_urls": [...]}`` for the current page, then
            (when more results are expected) a request for the next page.
        """
        # Parse the URLs out of the JSON response.
        images_json = json.loads(response.body.decode("UTF-8"))
        # "list" may be absent or null; treat that as an empty page.
        entries = images_json.get("list") or []
        # Skip entries without a URL so the pipeline never receives None.
        images_url_list = [img.get('qhimg_url') for img in entries if img.get('qhimg_url')]

        # Hand the URLs to the images pipeline for downloading.
        yield {"image_urls": images_url_list}

        # Advance page_sn (the search start offset) by the number of results
        # reported for this page; a missing count is treated as zero.
        count = images_json.get("count") or 0
        if count > 0 and SotuSpider.page_sn < self.MAX_COUNT:
            SotuSpider.page_sn += count
            yield scrapy.Request(url=self.start_urls[0].format(SotuSpider.page_sn),
                                 callback=self.parse, encoding="UTF-8")

posted @ 2020-10-20 22:16  小杜打醋尢买布  阅读(193)  评论(0)    收藏  举报