Scrapy files pipeline
Install Scrapy
pip install scrapy
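You can verify the install with:
scrapy version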
Create a new project
(python36) E:\www>scrapy startproject fileDownload
New Scrapy project 'fileDownload', using template directory 'c:\users\brady\.conda\envs\python36\lib\site-packages\scrapy\templates\project', created in:
    E:\www\fileDownload
You can start your first spider with:
    cd fileDownload
    scrapy genspider example example.com
(python36) E:\www>
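The generated project follows the usual Scrapy layout (a sketch, listing only the files used below):
fileDownload/
    scrapy.cfg
    fileDownload/
        items.py
        pipelines.py
        settings.py
        spiders/
The CrawlSpider below can be generated from the crawl template, e.g. scrapy genspider -t crawl pexels www.pexels.com, and then edited.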
Edit the spider to extract the content
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from fileDownload.items import FiledownloadItem
class PexelsSpider(CrawlSpider):
    name = 'pexels'
    allowed_domains = ['www.pexels.com']
    start_urls = ['https://www.pexels.com/photo/white-concrete-building-2559175/']
    rules = (
        Rule(LinkExtractor(allow=r'/photo/'), callback='parse_item', follow=True),
    )
    def parse_item(self, response):
        print(response.url)
        url = response.xpath("//img[contains(@src,'photos')]/@src").extract()
        item = FiledownloadItem()
        try:
            item['file_urls'] = url
            print("scraped image urls: " + str(url))  # url is a list, convert it before concatenating
            yield item
        except Exception as e:
            print(str(e))
Configure the item
import scrapy

class FiledownloadItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    file_urls = scrapy.Field()
settings.py
Enable the files pipeline
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 2,  # built-in files pipeline
}
FILES_STORE = ''  # storage path
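Downloads land in a full/ subdirectory of FILES_STORE; the path below is only an example:
FILES_STORE = 'E:/www/fileDownload/files'  # files are then written under E:/www/fileDownload/files/full/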
In the item
file_urls = scrapy.Field()
files = scrapy.Field()
In the spider, pass the image URLs to the pipeline through the file_urls field.
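Once the FilesPipeline has downloaded everything, it writes the results into the files field; the entries look roughly like this (illustrative values, exact keys can differ between Scrapy versions):
# example content of item['files'] after a download (values are made up)
[{'url': 'https://images.pexels.com/photos/2559175/example.jpeg',
  'path': 'full/example.jpeg',
  'checksum': 'd41d8cd98f00b204e9800998ecf8427e'}]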
Override the files pipeline to keep the original file name
In pipelines.py, create your own files pipeline that inherits from FilesPipeline:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.files import FilesPipeline
class FiledownloadPipeline(object):
    def process_item(self, item, spider):
        # strip query strings so the file name derived from the url stays clean
        tmp = item['file_urls']
        item['file_urls'] = []
        for i in tmp:
            if "?" in i:
                item['file_urls'].append(i.split('?')[0])
            else:
                item['file_urls'].append(i)
        print(item)
        return item
class MyFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        # keep only the last segment of the url as the file name
        file_path = request.url
        file_path = file_path.split('/')[-1]
        print("downloading file " + file_path)
        return 'full/%s' % file_path
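By default, FilesPipeline names each download after a SHA1 hash of its URL (still under full/), so overriding file_path as above is what keeps the original file name.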
In settings.py, switch to your own pipelines
ITEM_PIPELINES = {
    'fileDownload.pipelines.FiledownloadPipeline': 1,
    'fileDownload.pipelines.MyFilesPipeline': 2,
    #'scrapy.pipelines.files.FilesPipeline':2
}
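With FILES_STORE pointing at a writable directory, run the spider from the project root:
scrapy crawl pexels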
Fetching a full photo set
# -*- coding: utf-8 -*-
from time import sleep
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class AngelSpider(CrawlSpider):
    name = 'angel'
    allowed_domains = ['angelimg.spbeen.com']
    start_urls = ['http://angelimg.spbeen.com/']
    base_url = "http://angelimg.spbeen.com"
    rules = (
        Rule(LinkExtractor(allow=r'^http://angelimg.spbeen.com/ang/\d+$'), callback='parse_item', follow=False),
    )
    def parse_item(self, response):
        # reuse the item carried over from the previous page, or start a new one
        item = response.meta.get('item')
        if not item:
            item = {}
            item['files'] = []
            item['file_urls'] = []
        print(response.url)
        img_url = response.xpath('.//div[@id="content"]/a/img/@src').extract_first()
        item['file_urls'].append(img_url)
        # if there is a next page, request it and carry the item along; otherwise hand the item to the pipeline
        next_url = response.xpath('.//div[@class="page"]//a[contains(@class,"next")]/@href').extract_first()
        if next_url:
            next_url = self.base_url + next_url
            yield scrapy.Request(next_url,callback=self.parse_item,meta={'item':item})
        else:
            print(item)
            yield item
    # note: this callback is not referenced by any Request above and is left here unused
    def parse_next_response(self, response):
        item = response.meta.get('item')
        print(item, response.url)
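Because the item travels from page to page in meta, each gallery ends up as one item whose file_urls holds every image of the set. It runs the same way:
scrapy crawl angel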
GitHub repository
https://github.com/brady-wang/spider-fileDownload