1. Saving the scraped data as JSON
maintian.py
# -*- coding: utf-8 -*-
import scrapy

from MainTian.items import MaintianItem


class MaintianSpider(scrapy.Spider):
    """
    Spider name; the project can be run by this name:
        1. scrapy crawl maitian
    A second way is to run this maintian.py file directly:
        scrapy runspider maintian.py
    """
    name = 'maitian'
    # Domains the spider is allowed to crawl
    allowed_domains = ['maitian.cn']
    # Starting URL
    start_urls = ['http://bj.maitian.cn/zfall/R2C55']

    # Parse callback
    def parse(self, response):
        item = MaintianItem()
        item['title'] = response.xpath(
            '/html/body/section[2]/div[2]/div[2]/ul/li[1]/div[2]/h1/a/text()'
        ).extract()[0].strip()
        item['price'] = response.xpath(
            '/html/body/section[2]/div[2]/div[2]/ul/li[1]/div[2]/div/ol/strong/span/text()'
        ).extract()[0].strip()
        print(item)
        # Return the data to the pipeline
        return item
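The parse() method above only extracts the first listing on the page (li[1]). As a rough sketch of how the same idea could cover every listing, the variant below loops over all li nodes and yields one item per listing; the relative XPaths are assumptions derived from the absolute paths above and are not verified against the live site.

    # Hypothetical variant of parse(): iterate over every listing <li>
    def parse(self, response):
        for li in response.xpath('/html/body/section[2]/div[2]/div[2]/ul/li'):
            item = MaintianItem()
            # Relative selectors assumed from the absolute XPaths used above
            item['title'] = li.xpath('./div[2]/h1/a/text()').extract_first(default='').strip()
            item['price'] = li.xpath('./div[2]/div/ol/strong/span/text()').extract_first(default='').strip()
            # yield hands each item to the pipeline, one per listing
            yield item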
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MaintianItem(scrapy.Item):
    """
    Defines the data fields
    """
    # name = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# JSON exporter used to write the exported json file
from scrapy.exporters import JsonItemExporter


class MaintianPipeline(object):

    def open_spider(self, spider):
        # 1. File object to write to, opened once when the spider starts
        #    (opening it inside process_item would truncate the file for every item)
        self.file = open('maitian.json', 'wb')
        # 2. Create the exporter
        self.exporter = JsonItemExporter(self.file)
        # 3. Start the exporter
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        """
        item is the value returned by the spider's parse() method (return item)
        :param item:
        :param spider:
        :return:
        """
        # 4. Export the data
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # 5. Finish the exporter
        self.exporter.finish_exporting()
        # 6. Close the file
        self.file.close()
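For the pipeline to receive items, it also has to be registered in settings.py, as the comment at the top of pipelines.py notes. A minimal sketch, assuming the project package is named MainTian (matching the import in maintian.py); the priority value 300 is an arbitrary conventional choice:

    # settings.py (excerpt)
    ITEM_PIPELINES = {
        'MainTian.pipelines.MaintianPipeline': 300,
    }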
