scrapy数据写入管道

1 在 settings.py 里面启用管道

# Enable the CSV-writing pipeline. Scrapy runs the enabled pipelines in
# ascending order of the integer assigned to each one.
ITEM_PIPELINES = {'ganji.pipelines.GanjiPipeline': 300}

2 在 spider 里把抓取到的数据通过 yield 交给管道

# -*- coding: utf-8 -*-
import csv

import scrapy


class GjSpider(scrapy.Spider):
    """Spider that scrapes rental listings from the Ganji Shenzhen rental page.

    Every listing found on the page is yielded as a plain ``dict`` with the
    keys ``title``, ``size``, ``chaoxiang``, ``price`` and ``address`` so that
    the enabled item pipelines can process it.
    """

    name = 'gj'
    allowed_domains = ['ganji.com']
    start_urls = ['http://sz.ganji.com/zufang/']

    def optimizeContent(self, res):
        """Strip byte-literal residue and unwanted tokens from ``res``.

        Removes ``b'`` prefixes, literal backslash-n sequences, single quotes
        and literal backslash-dot sequences, and renames ``style`` to
        ``nouse``.

        Args:
            res: The text to clean. ``None`` or an empty string is returned
                unchanged instead of raising ``AttributeError``.

        Returns:
            The cleaned text, or ``res`` itself when it is falsy.
        """
        if not res:
            return res
        res = res.replace('b\'', '')
        res = res.replace('\\n', '')
        res = res.replace('\'', '')
        res = res.replace('style', 'nouse')
        # '\\.' is the two-character sequence backslash + dot; spelling it
        # '\.' relies on an invalid escape that newer Pythons warn about.
        res = res.replace('\\.', '')
        return res

    def parse(self, response):
        """Yield one dict per listing found in ``response``.

        Args:
            response: The downloaded listing page.

        Yields:
            dict: ``title``, ``size``, ``chaoxiang``, ``price`` and
            ``address`` of a listing. Fields whose XPath matches nothing are
            ``None``; ``address`` is the available address parts joined by
            ``-`` (an empty string when neither part is found).
        """
        self.logger.debug('Parsing %s', response.url)
        houses = response.xpath('.//div[@class="f-main-list"]/div/div[position()>2]')

        for house in houses:
            title = house.xpath(".//dl/dd[contains(@class,'title')]/a/@title").extract_first()
            size = house.xpath(".//dl/dd[contains(@class,'size')]/span[3]/text()").extract_first()
            chaoxiang = house.xpath(".//dl/dd[contains(@class,'size')]/span[5]/text()").extract_first()
            price = house.xpath(".//dl/dd[contains(@class,'info')]/div/span[1]/text()").extract_first()
            address1 = house.xpath(".//dl/dd[contains(@class,'address')]/span/a[1]/text()").extract_first()
            address2 = house.xpath(".//dl/dd[contains(@class,'address')]/span/a[2]/span/text()").extract_first()

            # Join only the parts that were actually found, so a missing
            # node does not leak the literal text "None" into the output.
            address = "-".join(part for part in (address1, address2) if part)

            yield {
                'title': title,
                'size': size,
                'chaoxiang': chaoxiang,
                'price': price,
                'address': address,
            }

3 在 pipelines.py 里把数据写入 CSV 文件

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import csv


class GanjiPipeline(object):
    """Item pipeline that appends scraped listings to ``ganji.csv``.

    The file is opened once (in append mode) when the spider starts and is
    closed when the spider finishes, instead of being reopened for every
    item. The header row is written only when the file is still empty, so
    repeated crawls do not insert duplicate header lines.
    """

    CSV_PATH = 'ganji.csv'
    HEADER = ['标题', '大小', '朝向', '价格', '地址']
    # Item keys, in the same order as the HEADER columns.
    FIELDS = ('title', 'size', 'chaoxiang', 'price', 'address')

    # Class-level defaults so the methods are safe before open_spider runs.
    _fp = None
    _writer = None

    def _ensure_open(self):
        """Open the CSV file and create the writer if not done already."""
        if self._fp is not None:
            return
        # newline='' lets the csv module control line endings; without it
        # every row is followed by a blank line on Windows.
        self._fp = open(self.CSV_PATH, 'a', encoding='utf_8_sig', newline='')
        self._writer = csv.writer(self._fp)
        # In append mode the stream starts at the end of the file, so a
        # position of 0 means the file is empty and still needs its header.
        if self._fp.tell() == 0:
            self._writer.writerow(self.HEADER)

    def open_spider(self, spider):
        """Open the output file when the spider starts.

        Args:
            spider: The spider being opened (unused).
        """
        self._ensure_open()

    def close_spider(self, spider):
        """Flush and close the output file when the spider finishes.

        Args:
            spider: The spider being closed (unused).
        """
        if self._fp is not None:
            self._fp.close()
            self._fp = None
            self._writer = None

    def process_item(self, item, spider):
        """Clean the item's title and append the item as one CSV row.

        Args:
            item: Mapping with the keys ``title``, ``size``, ``chaoxiang``,
                ``price`` and ``address``. ``item['title']`` is replaced by
                its cleaned version.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.

        Raises:
            KeyError: If one of the expected keys is missing from ``item``.
        """
        self._ensure_open()
        item['title'] = self.optimizeContent(item['title'])
        self._writer.writerow([item[field] for field in self.FIELDS])
        return item

    def optimizeContent(self, res):
        """Strip byte-literal residue and unwanted tokens from ``res``.

        Removes ``b'`` prefixes, literal backslash-n sequences, single quotes
        and literal backslash-dot sequences, and renames ``style`` to
        ``nouse``.

        Args:
            res: The text to clean. ``None`` or an empty string is returned
                unchanged instead of raising ``AttributeError``.

        Returns:
            The cleaned text, or ``res`` itself when it is falsy.
        """
        if not res:
            return res
        res = res.replace('b\'', '')
        res = res.replace('\\n', '')
        res = res.replace('\'', '')
        res = res.replace('style', 'nouse')
        # '\\.' is the two-character sequence backslash + dot; spelling it
        # '\.' relies on an invalid escape that newer Pythons warn about.
        res = res.replace('\\.', '')
        return res

posted @ 2020-03-16 15:53 brady-wang 阅读(578) 评论(0) 收藏举报

刷新页面返回顶部

风行天下

天地不仁以万物为刍狗

scrapy数据写入管道

公告

风行天下

天地不仁 以万物为刍狗

scrapy数据写入管道

公告

天地不仁以万物为刍狗