scrapy下载一波头像
最近手机中收藏的头像不够用了,直接用scrapy下载一波吧
items.py
class ProfilesItem(scrapy.Item):
    """Item carrying one avatar: the source album id and its cover image URL."""
    # Album id taken from the duitang API response (`album.id`).
    id = scrapy.Field()
    # URL of the album's first cover image; consumed by the image pipeline.
    url = scrapy.Field()
spider文件
import scrapy
from urllib.parse import urlencode
from time import time
from scrapy import Request
import json
from profiles.items import ProfilesItem
class ImageSpider(scrapy.Spider):
    """Crawl duitang.com's avatar listing API and yield one ProfilesItem per image.

    Fixes over the original:
    - paginate from page 0 instead of page 1, so the first 24 results
      (``start=0``) are no longer skipped;
    - tolerate malformed/empty JSON payloads instead of raising
      ``AttributeError`` on ``.get(...)`` of ``None``;
    - skip entries whose ``covers`` list is empty instead of raising
      ``IndexError``.
    """

    name = 'image'
    allowed_domains = ['www.duitang.com']

    def start_requests(self):
        """Generate paginated API requests, 24 entries per page, 50 pages."""
        base_url = 'https://www.duitang.com/napi/blog/list/by_filter_id/?'
        data = {
            'include_fields': 'top_comments,is_root,source_link,item,buyable,root_id,status,like_count,sender,album,reply_count',
            'filter_id': '头像',
        }
        # Start at page 0 so the first page of results is included.
        for page in range(50):
            data['start'] = page * 24
            # Millisecond timestamp mimicking the site's JS cache-buster.
            data['_'] = int(time() * 1000)
            yield Request(url=base_url + urlencode(data), callback=self.parse)

    def parse(self, response):
        """Extract album id and first cover URL from the JSON payload.

        One malformed page yields nothing rather than killing the crawl.
        """
        result = json.loads(response.text)
        for image in result.get('data', {}).get('object_list', []):
            album = image.get('album') or {}
            covers = album.get('covers') or []
            if not covers:
                # Entry has no cover image — nothing to download.
                continue
            item = ProfilesItem()
            item['id'] = album.get('id')
            item['url'] = covers[0]
            yield item
pipelines中重写一下ImagesPipeline
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
from scrapy.exceptions import DropItem


class ImagePipeline(ImagesPipeline):
    """Store each image under its original URL filename instead of the SHA1 default."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the on-disk filename: the last path segment of the image URL.

        ``item`` is accepted (and ignored) for compatibility with
        Scrapy >= 2.4, which passes it as a keyword argument; omitting it
        triggers a deprecation path in newer Scrapy versions.
        """
        # NOTE(review): assumes cover URLs carry no query string and have
        # unique basenames — colliding names would overwrite each other.
        # TODO confirm against actual duitang URLs.
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        """Drop the item if its image failed to download; otherwise pass it on."""
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item

    def get_media_requests(self, item, info):
        """Schedule one download request for the item's image URL."""
        yield Request(item['url'])
最后setting中添加一些设置,其他的不用修改
# Directory where ImagesPipeline writes the downloaded image files.
IMAGES_STORE = 'D:\\pic\\profiles'
# Browser-like User-Agent so the duitang API does not reject the requests.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
# One request per second — stay polite and avoid being throttled.
DOWNLOAD_DELAY = 1
# Enable the custom pipeline defined in pipelines.py.
ITEM_PIPELINES = {
    'profiles.pipelines.ImagePipeline': 300,
}

浙公网安备 33010602011771号