scrapy使用四:使用手机app抓取某站图片
1.创建项目、爬虫的脚本:
"""Bootstrap the ``douyu`` Scrapy project and its ``douyucdn`` spider."""
import subprocess


def main():
    """Create the project, then generate the spider inside it.

    Each step runs as a separate ``scrapy`` process because
    ``scrapy.cmdline.execute()`` finishes by calling ``sys.exit()``, so
    chaining several ``execute()`` calls in one interpreter only ever runs
    the first one.

    Raises:
        subprocess.CalledProcessError: if either scrapy command fails.
    """
    subprocess.run(["scrapy", "startproject", "douyu"], check=True)
    # "cd" is a shell builtin, not a scrapy command: use cwd= to run
    # genspider from inside the new project directory instead.
    # genspider requires both a spider name and a domain.
    subprocess.run(
        ["scrapy", "genspider", "douyucdn", "douyucdn.cn"],
        cwd="douyu",
        check=True,
    )


if __name__ == "__main__":
    main()
2.打开网站,得到json字符串:http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=
3.明确需求,编写items.py
import scrapy


class DouyuItem(scrapy.Item):
    """Container for one scraped streamer record."""

    # Streamer's display name.
    nickname = scrapy.Field()

    # URL of the streamer's photo.
    imagelink = scrapy.Field()
4.编写爬虫文件,douyucdn.py
# -*- coding: utf-8 -*-
"""Spider that pages through the Douyu vertical-room JSON API."""
import json

import scrapy

from douyu.items import DouyuItem


class DouyucdnSpider(scrapy.Spider):
    """Yield one ``DouyuItem`` per room entry, following pages until empty."""

    name = "douyucdn"
    # Bare domain only: stray quote characters inside the string would never
    # match a real host, and the offsite filter would drop every follow-up
    # request.
    allowed_domains = ["douyucdn.cn"]
    base_url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    offset = 0
    start_urls = [base_url + str(offset)]

    def parse(self, response):
        """Parse one page of the JSON API.

        Yields:
            A ``DouyuItem`` for every entry in the response's ``data`` list,
            then a ``scrapy.Request`` for the next page.
        """
        data_list = json.loads(response.body)["data"]

        # The page after the last one returns an empty list: stop paging.
        if not data_list:
            return

        for data in data_list:
            item = DouyuItem()
            # Missing or empty values are normalised to an empty string.
            item["nickname"] = data.get("nickname") or ""
            item["imagelink"] = data.get("vertical_src") or ""
            yield item

        # Advance by the page size (limit=20) and request the next page.
        self.offset += 20
        yield scrapy.Request(
            self.base_url + str(self.offset),
            callback=self.parse,
        )
5.编写pipelines.py
import os

import scrapy
from scrapy.pipelines.images import ImagesPipeline

from douyu.settings import IMAGES_STORE as images_store


class DouyuPipeline(ImagesPipeline):
    """Download each item's image and rename the file after the streamer."""

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image, if it has one."""
        image_link = item['imagelink']
        # An empty link cannot be turned into a Request (scrapy raises
        # ValueError for a URL without a scheme), so skip the download.
        if image_link:
            yield scrapy.Request(image_link)

    def item_completed(self, results, item, info):
        """Rename the downloaded image to ``<nickname>.jpg``.

        ``results`` has the shape
        ``[(True, {"url": image URL, "path": path relative to IMAGES_STORE,
        e.g. 'full/sdf23423sdf.jpg', "checksum": md5 of the file})]``.

        Returns:
            The item, unchanged, so later pipelines still receive it.
        """
        # Keep only the paths of downloads that succeeded.
        image_path = [x["path"] for ok, x in results if ok]
        if image_path:
            # os.path.join supplies the separator that plain string
            # concatenation omits when IMAGES_STORE has no trailing slash.
            os.rename(
                os.path.join(images_store, image_path[0]),
                os.path.join(images_store, item['nickname'] + '.jpg'),
            )
        return item
6.编写settings.py
# Project settings overrides (settings.py).

# Enable the image pipeline. The key must be the exact dotted path of the
# pipeline class, otherwise Scrapy cannot load it.
ITEM_PIPELINES = {
    'douyu.pipelines.DouyuPipeline': 300,
}

# Directory where downloaded images are stored.
IMAGES_STORE = "/Users/administrators/Desktop/douyu/Images"

# Present the crawler as a mobile (iPhone) browser.
USER_AGENT = "Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.3 (KHTML, like Gecko) Version/8.0 Mobile/12A4345d Safari/600.1.4"

# Do not honour robots.txt.
ROBOTSTXT_OBEY = False
posted on 2018-10-05 14:16 myworldworld 阅读(275) 评论(0) 收藏 举报