scrapy当当网练习
def parse(self, response):
    """Parse one Dangdang category listing page and yield one item per book.

    src, name and price all live under a shared <li> parent. Lazy-loaded
    images store their real URL in @data-original, while the first few
    <li> elements only have @src — so prefer @data-original and fall back.
    """
    print('当当网')
    for li in response.xpath('//ul[@id="component_59"]/li'):
        src = li.xpath('./a/img/@data-original').extract_first()
        if not src:
            src = li.xpath('./a/img/@src').extract_first()
        if src is None:
            # No usable image URL at all — skip rather than crash on
            # 'http:' + None below.
            continue
        name = li.xpath('./a/img/@alt').extract_first()
        price = li.xpath(
            './p[@class="price"]/span[@class="search_now_price"]/text()'
        ).extract_first()
        res_src = 'http:' + src
        print(res_src, name, price)
        # Hand the item to the pipelines.
        yield ScrapyproItem(src=res_src, name=name, price=price)
settings.py
# settings.py — register the item pipeline. The number (0-1000) is the
# run order: lower values run earlier.
ITEM_PIPELINES = {
'scrapyPro.pipelines.ScrapyproPipeline': 300,
}
items.py
class ScrapyproItem(scrapy.Item):
    """Container for one scraped book: cover image URL, title and price."""
    src = scrapy.Field()    # full cover-image URL (with 'http:' prefix)
    name = scrapy.Field()   # book title (the img @alt text)
    price = scrapy.Field()  # current price text from search_now_price
pipelines.py
class ScrapyproPipeline:
    """Append each scraped item to book.json as one JSON object per line.

    The original wrote ``str(item)`` — a Python repr, not JSON — into a
    file named .json. JSON Lines output keeps the file parseable.
    """

    def process_item(self, item, spider):
        import json  # local import: the snippet has no top-level import block
        # 'a' mode so items from every page accumulate across calls.
        with open('book.json', 'a', encoding='utf-8') as fp:
            fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item
新定义一个pipeline用来下载图片:
class DangDownloadPicture:
    """Pipeline that downloads each book's cover image into ./books/."""

    def process_item(self, item, spider):
        import os
        import urllib.request  # not imported anywhere in the snippet
        # urlretrieve does not create missing directories — ensure it exists.
        os.makedirs('./books', exist_ok=True)
        url = item.get('src')
        filename = './books/' + item.get('name') + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
        return item
settings.py 301表示优先级,数字越小优先级越高
# settings.py — a project has exactly ONE ITEM_PIPELINES dict, so both
# pipelines must be registered together; defining a new dict with only
# the download pipeline would silently disable the JSON pipeline.
# 301 > 300, so the image download runs after the JSON write
# (lower number = higher priority = runs earlier).
ITEM_PIPELINES = {
    'scrapyPro.pipelines.ScrapyproPipeline': 300,
    'scrapyPro.pipelines.DangDownloadPicture': 301,
}
下载100页的图片和json数据:
class DangSpider(scrapy.Spider):
    """Crawl up to 100 pages of a Dangdang book category.

    Yields one ScrapyproItem (image URL, title, price) per book, then
    requests the next listing page until page 100.
    """

    name = 'dang'
    allowed_domains = ['category.dangdang.com']
    start_urls = ['http://category.dangdang.com/cp01.01.02.00.00.00.html']
    # Page n (n >= 2) URL shape:
    # http://category.dangdang.com/pg{n}-cp01.01.02.00.00.00.html
    base_url = 'http://category.dangdang.com/pg'
    page = 1

    def parse(self, response):
        print('当当网')
        for li in response.xpath('//ul[@id="component_59"]/li'):
            # Lazy-loaded images keep the real URL in @data-original;
            # the first few <li> only have @src, so fall back to it.
            src = li.xpath('./a/img/@data-original').extract_first()
            if not src:
                src = li.xpath('./a/img/@src').extract_first()
            if src is None:
                # No usable image URL — skip instead of raising on
                # 'http:' + None.
                continue
            name = li.xpath('./a/img/@alt').extract_first()
            price = li.xpath(
                './p[@class="price"]/span[@class="search_now_price"]/text()'
            ).extract_first()
            res_src = 'http:' + src
            print(res_src, name, price)
            # Hand the item to the pipelines.
            yield ScrapyproItem(src=res_src, name=name, price=price)

        # Follow pagination: mutate the per-spider page counter and
        # request the next page with the same callback.
        if self.page < 100:
            self.page += 1
            next_url = self.base_url + str(self.page) + '-cp01.01.02.00.00.00.html'
            yield scrapy.Request(url=next_url, callback=self.parse)
浙公网安备 33010602011771号