scrapy数据写入管道
1 在 settings.py 里面启用管道
# Register the CSV-writing pipeline; 300 is the integer Scrapy uses to order
# the enabled pipelines.
ITEM_PIPELINES = {
    'ganji.pipelines.GanjiPipeline': 300,
}
2 爬虫拿到的数据通过 yield 返回给管道
# -*- coding: utf-8 -*-
import csv

import scrapy


class GjSpider(scrapy.Spider):
    """Spider that extracts rental listings from the pages in ``start_urls``.

    Each listing is yielded as a plain dict with the keys ``title``,
    ``size``, ``chaoxiang`` (orientation), ``price`` and ``address`` so that
    the item pipeline can persist it.
    """

    name = 'gj'
    allowed_domains = ['ganji.com']
    start_urls = ['http://sz.ganji.com/zufang/']

    def optimizeContent(self, res):
        """Strip scraping artifacts from a text value.

        Removes ``b'`` prefixes, literal backslash-n sequences, single
        quotes and literal backslash-dot sequences, and renames ``style``
        to ``nouse``.

        :param res: the text to clean, or ``None``.
        :return: the cleaned text; ``None`` is returned unchanged.
        """
        if res is None:
            # extract_first() yields None when the node is missing.
            return res
        # Order matters: "b'" must be removed before the bare quote is.
        replacements = (
            ('b\'', ''),
            ('\\n', ''),
            ('\'', ''),
            ('style', 'nouse'),
            ('\\.', ''),  # same value as the old '\.' without the invalid escape
        )
        for old, new in replacements:
            res = res.replace(old, new)
        return res

    def parse(self, response):
        """Yield one dict per listing found in the response.

        :param response: the downloaded listing page.
        :return: a generator of listing dicts.
        """
        self.logger.debug('Parsing %s', response.url)
        house_list = response.xpath(
            './/div[@class="f-main-list"]/div/div[position()>2]')
        for house in house_list:
            title = house.xpath(
                ".//dl/dd[contains(@class,'title')]/a/@title").extract_first()
            size = house.xpath(
                ".//dl/dd[contains(@class,'size')]/span[3]/text()").extract_first()
            chaoxiang = house.xpath(
                ".//dl/dd[contains(@class,'size')]/span[5]/text()").extract_first()
            price = house.xpath(
                ".//dl/dd[contains(@class,'info')]/div/span[1]/text()").extract_first()
            address1 = house.xpath(
                ".//dl/dd[contains(@class,'address')]/span/a[1]/text()").extract_first()
            address2 = house.xpath(
                ".//dl/dd[contains(@class,'address')]/span/a[2]/span/text()").extract_first()
            # Join only the parts that were found, so a missing node does
            # not end up in the output as the literal text "None".
            address = "-".join(
                part for part in (address1, address2) if part is not None)
            yield {
                'title': title,
                "size": size,
                "chaoxiang": chaoxiang,
                "price": price,
                "address": address,
            }
3 在 pipelines.py 文件里面把数据写入 CSV 文件
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import csv
import logging
import os

logger = logging.getLogger(__name__)


class GanjiPipeline(object):
    """Item pipeline that appends scraped listings to ``ganji.csv``.

    The file is opened once per crawl instead of once per item, the header
    row is written only when the file is still empty, and the handle is
    released in ``close_spider``.
    """

    CSV_PATH = 'ganji.csv'
    CSV_ENCODING = 'utf_8_sig'
    HEADER = ['标题', '大小', '朝向', '价格', '地址']
    FIELDS = ('title', 'size', 'chaoxiang', 'price', 'address')

    # Class-level defaults so the pipeline works even if open_spider was
    # never called (the file is then opened lazily on the first item).
    _fp = None
    _writer = None

    def _ensure_open(self):
        """Open the CSV file if needed and return the ``csv.writer``."""
        if self._fp is None:
            needs_header = (not os.path.exists(self.CSV_PATH)
                            or os.path.getsize(self.CSV_PATH) == 0)
            # newline='' is required by the csv module; without it every
            # row is followed by a blank line on Windows.
            self._fp = open(self.CSV_PATH, 'a',
                            encoding=self.CSV_ENCODING, newline='')
            self._writer = csv.writer(self._fp)
            if needs_header:
                self._writer.writerow(self.HEADER)
                self._fp.flush()
        return self._writer

    def open_spider(self, spider):
        """Open the output file and write the header if the file is empty.

        :param spider: the spider being opened (unused).
        """
        self._ensure_open()

    def close_spider(self, spider):
        """Close the output file.

        :param spider: the spider being closed (unused).
        """
        if self._fp is not None:
            self._fp.close()
            self._fp = None
            self._writer = None

    def process_item(self, item, spider):
        """Clean the item's title and append the item as one CSV row.

        :param item: mapping with the keys listed in ``FIELDS``; its
            ``title`` entry is replaced by the cleaned value.
        :param spider: the spider that produced the item (unused).
        :return: the same item, for any later pipeline.
        """
        item['title'] = self.optimizeContent(item['title'])
        logger.debug('Writing item: %s', item['title'])
        writer = self._ensure_open()
        writer.writerow([item[field] for field in self.FIELDS])
        # Flush per row so already-scraped rows survive an aborted crawl,
        # as they did when the file was reopened for every item.
        self._fp.flush()
        return item

    def optimizeContent(self, res):
        """Strip scraping artifacts from a text value.

        Removes ``b'`` prefixes, literal backslash-n sequences, single
        quotes and literal backslash-dot sequences, and renames ``style``
        to ``nouse``.

        :param res: the text to clean, or ``None``.
        :return: the cleaned text; ``None`` is returned unchanged (the csv
            writer emits it as an empty cell).
        """
        if res is None:
            return res
        # Order matters: "b'" must be removed before the bare quote is.
        replacements = (
            ('b\'', ''),
            ('\\n', ''),
            ('\'', ''),
            ('style', 'nouse'),
            ('\\.', ''),  # same value as the old '\.' without the invalid escape
        )
        for old, new in replacements:
            res = res.replace(old, new)
        return res

浙公网安备 33010602011771号