#这里只爬取第一页
items.py
import scrapy
#定义爬取数据
class InsistItem(scrapy.Item):
    """Item carrying image links for Scrapy's ImagesPipeline.

    The ImagesPipeline reads the URL list from ``image_urls`` (see
    IMAGES_URLS_FIELD in settings) and writes its download results back
    into ``images`` — both fields must be declared on the item, otherwise
    the result metadata is silently discarded.
    """
    # List of image URLs to download; ImagesPipeline expects a list even
    # for a single link.
    image_urls = scrapy.Field()
    # Filled in by ImagesPipeline with per-image download results.
    images = scrapy.Field()
tengxun.py
import scrapy
from insist.items import InsistItem
import json
class TengxunSpider(scrapy.Spider):
    """Spider that scrapes room thumbnail URLs from the Douyu vertical-room API.

    Only the first page is crawled: ``offset=`` is left empty in the start
    URL, which the API treats as offset 0.
    """
    name = 'tengxun'
    allowed_domains = ['douyucdn.cn']
    start_urls = ['http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=']

    def parse(self, response):
        """Parse the JSON API response and yield one item per room image.

        Yields InsistItem objects whose ``image_urls`` field holds a
        one-element list — ImagesPipeline requires a LIST of URLs even
        for a single link.
        """
        con = json.loads(response.body)
        datas = con['data']
        # Use the spider logger instead of a bare print() for debug output.
        self.logger.debug('received %d room entries', len(datas))
        for entry in datas:
            # Create a FRESH item per result. The original reused one item
            # instance across all yields, so every yielded reference was
            # mutated to the last room's URL before pipelines ran.
            item = InsistItem()
            item['image_urls'] = [entry['vertical_src']]
            yield item
settings.py
# Enable only Scrapy's built-in image pipeline; the project's own
# pipeline stays disabled.
ITEM_PIPELINES = {
    # 'insist.pipelines.InsistPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1,
}

# Directory where downloaded images are saved.
IMAGES_STORE = 'C:\\Users\\lenovo\\Desktop\\data'
# Name of the item field that holds the list of image URLs.
IMAGES_URLS_FIELD = 'image_urls'
pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline#导包
class SDPipeline(ImagesPipeline):
    """Custom image pipeline (currently disabled in ITEM_PIPELINES)."""

    def get_media_requests(self, item, info):
        """Yield one download Request per URL in item['image_urls'].

        ``item['image_urls']`` is a LIST of URL strings. The original code
        passed the whole list to scrapy.Request, whose ``url`` argument
        must be a single string — iterate and yield one Request per URL.
        """
        for image_link in item['image_urls']:
            yield scrapy.Request(image_link)
最后在项目目录下运行 scrapy crawl tengxun。
下载的图片会保存在 IMAGES_STORE 指定目录下自动生成的 full 子文件夹中。