scrapy下载一波头像

最近手机中收藏的头像不够用了,直接用scrapy下载一波吧

items.py

class ProfilesItem(scrapy.Item):
    """Scrapy item describing one avatar image to download."""

    # Identifier of the album the image belongs to.
    id = scrapy.Field()
    # URL of the image file to fetch.
    url = scrapy.Field()

spider文件


import scrapy
from urllib.parse import urlencode
from time import time
from scrapy import Request
import json
from profiles.items import ProfilesItem

class ImageSpider(scrapy.Spider):
    """Query the duitang.com blog-list API and yield one item per avatar.

    Each API page is requested with a ``start`` offset; every entry of the
    returned ``data.object_list`` that has an album with at least one cover
    becomes a ``ProfilesItem``.
    """

    name = 'image'
    allowed_domains = ['www.duitang.com']

    def start_requests(self):
        """Yield one API request per result page.

        Offsets are ``page * 24`` for ``page`` in 1..49, i.e. 24 through 1176.
        NOTE(review): offset 0 is not requested, exactly as in the original
        code -- confirm whether skipping the first page is intended.
        """
        base_url = 'https://www.duitang.com/napi/blog/list/by_filter_id/?'
        include_fields = ('top_comments,is_root,source_link,item,buyable,'
                          'root_id,status,like_count,sender,album,reply_count')
        for page in range(1, 50):
            # Build a fresh mapping per request instead of mutating a shared
            # dict, so each URL is derived from its own parameters only.
            params = {
                'include_fields': include_fields,
                'filter_id': '头像',
                'start': page * 24,
                # Millisecond timestamp sent as the "_" query parameter.
                '_': int(time() * 1000),
            }
            yield Request(url=base_url + urlencode(params),
                          callback=self.parse)

    def parse(self, response):
        """Turn one JSON API response into ``ProfilesItem`` objects.

        Responses that are not JSON objects, and entries without an album or
        without any cover, are skipped instead of raising, so one malformed
        entry does not discard the rest of the page.
        """
        try:
            result = json.loads(response.text)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass.
            self.logger.warning('Non-JSON response from %s', response.url)
            return
        if not isinstance(result, dict):
            return

        entries = (result.get('data') or {}).get('object_list') or []
        for image in entries:
            if not isinstance(image, dict):
                continue
            album = image.get('album') or {}
            covers = album.get('covers') or []
            if not covers:
                continue
            item = ProfilesItem()
            item['id'] = album.get('id')
            item['url'] = covers[0]
            yield item

在pipelines.py中继承并重写一下ImagesPipeline

from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
from scrapy.exceptions import DropItem

class ImagePipeline(ImagesPipeline):
    """Images pipeline that names each stored file after its URL."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the image fetched by ``request``.

        The path is the last ``/``-separated segment of the request URL with
        any query string or fragment removed. ``item`` is accepted (and
        ignored) for Scrapy versions that pass it as a keyword argument;
        callers that omit it keep working.
        """
        # Strip "#fragment" and "?query" first: they are not part of the file
        # name, and "?" is not a legal character in Windows file names.
        url = request.url.split('#', 1)[0].split('?', 1)[0]
        return url.split('/')[-1]

    def item_completed(self, results, item, info):
        """Return ``item`` if at least one image download succeeded.

        ``results`` is iterated as ``(ok, detail)`` pairs.

        Raises:
            DropItem: if no download in ``results`` succeeded.
        """
        if not any(ok for ok, _ in results):
            raise DropItem('Image Downloaded Failed')
        return item

    def get_media_requests(self, item, info):
        """Yield the single download request for the item's ``url`` field."""
        yield Request(item['url'])

最后在settings.py中添加一些设置,其他的不用修改

# Directory where the images pipeline stores downloaded files.
IMAGES_STORE = 'D:\\pic\\profiles'

# Browser-like User-Agent header sent with requests.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/83.0.4103.97 Safari/537.36'
)

# Delay between consecutive requests, in seconds.
DOWNLOAD_DELAY = 1

# Enable the project's image pipeline.
ITEM_PIPELINES = {'profiles.pipelines.ImagePipeline': 300}

 

posted @ 2020-07-14 17:04  fruhling  阅读(172)  评论(0)    收藏  举报