scrapy当当网练习
def parse(self, response):
    """Parse one Dangdang category listing page and yield one item per book.

    src, name and price all live under a shared <li> parent. Lazy-loaded
    images store their real URL in @data-original, while the first few
    <li> elements only have @src — so prefer @data-original and fall back.
    """
    print('当当网')
    for li in response.xpath('//ul[@id="component_59"]/li'):
        src = li.xpath('./a/img/@data-original').extract_first()
        if not src:
            src = li.xpath('./a/img/@src').extract_first()
        if src is None:
            # No usable image URL at all — skip rather than crash on
            # 'http:' + None below.
            continue
        name = li.xpath('./a/img/@alt').extract_first()
        price = li.xpath(
            './p[@class="price"]/span[@class="search_now_price"]/text()'
        ).extract_first()
        res_src = 'http:' + src
        print(res_src, name, price)
        # Hand the item to the pipelines.
        yield ScrapyproItem(src=res_src, name=name, price=price)
settings.py
# settings.py — register the item pipeline. The number (0-1000) is the
# run order: lower values run earlier.
ITEM_PIPELINES = {
'scrapyPro.pipelines.ScrapyproPipeline': 300,
}
items.py
class ScrapyproItem(scrapy.Item):
    """Container for one scraped book: cover image URL, title and price."""
    src = scrapy.Field()    # full cover-image URL (with 'http:' prefix)
    name = scrapy.Field()   # book title (the img @alt text)
    price = scrapy.Field()  # current price text from search_now_price
pipelines.py
class ScrapyproPipeline:
    """Append each scraped item to book.json as one JSON object per line.

    The original wrote ``str(item)`` — a Python repr, not JSON — into a
    file named .json. JSON Lines output keeps the file parseable.
    """

    def process_item(self, item, spider):
        import json  # local import: the snippet has no top-level import block
        # 'a' mode so items from every page accumulate across calls.
        with open('book.json', 'a', encoding='utf-8') as fp:
            fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item
新定义一个pipeline用来下载图片:
class DangDownloadPicture:
    """Pipeline that downloads each book's cover image into ./books/."""

    def process_item(self, item, spider):
        import os
        import urllib.request  # not imported anywhere in the snippet
        # urlretrieve does not create missing directories — ensure it exists.
        os.makedirs('./books', exist_ok=True)
        url = item.get('src')
        filename = './books/' + item.get('name') + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
        return item
settings.py 301表示优先级,数字越小优先级越高
# settings.py — a project has exactly ONE ITEM_PIPELINES dict, so both
# pipelines must be registered together; defining a new dict with only
# the download pipeline would silently disable the JSON pipeline.
# 301 > 300, so the image download runs after the JSON write
# (lower number = higher priority = runs earlier).
ITEM_PIPELINES = {
    'scrapyPro.pipelines.ScrapyproPipeline': 300,
    'scrapyPro.pipelines.DangDownloadPicture': 301,
}
下载100页的图片和json数据:
class DangSpider(scrapy.Spider):
    """Crawl up to 100 pages of a Dangdang book category.

    Yields one ScrapyproItem (image URL, title, price) per book, then
    requests the next listing page until page 100.
    """

    name = 'dang'
    allowed_domains = ['category.dangdang.com']
    start_urls = ['http://category.dangdang.com/cp01.01.02.00.00.00.html']
    # Page n (n >= 2) URL shape:
    # http://category.dangdang.com/pg{n}-cp01.01.02.00.00.00.html
    base_url = 'http://category.dangdang.com/pg'
    page = 1

    def parse(self, response):
        print('当当网')
        for li in response.xpath('//ul[@id="component_59"]/li'):
            # Lazy-loaded images keep the real URL in @data-original;
            # the first few <li> only have @src, so fall back to it.
            src = li.xpath('./a/img/@data-original').extract_first()
            if not src:
                src = li.xpath('./a/img/@src').extract_first()
            if src is None:
                # No usable image URL — skip instead of raising on
                # 'http:' + None.
                continue
            name = li.xpath('./a/img/@alt').extract_first()
            price = li.xpath(
                './p[@class="price"]/span[@class="search_now_price"]/text()'
            ).extract_first()
            res_src = 'http:' + src
            print(res_src, name, price)
            # Hand the item to the pipelines.
            yield ScrapyproItem(src=res_src, name=name, price=price)

        # Follow pagination: mutate the per-spider page counter and
        # request the next page with the same callback.
        if self.page < 100:
            self.page += 1
            next_url = self.base_url + str(self.page) + '-cp01.01.02.00.00.00.html'
            yield scrapy.Request(url=next_url, callback=self.parse)
浙公网安备 33010602011771号