Scrapy usage 5.1: handling data with custom JSON and MySQL pipelines vs. Scrapy's built-in modules

Custom JSON pipeline: using the json module to write the items out as JSON

import codecs
import json


class MyPipeline(object):

    def __init__(self):
        # Open the output file once when the pipeline is created
        self.file = codecs.open('article.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # One JSON object per line; ensure_ascii=False keeps non-ASCII text readable
        lines = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(lines)
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider automatically when the spider finishes
        self.file.close()
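
For the pipeline to run at all, it has to be registered in settings.py. A minimal sketch, assuming the Scrapy project is named myproject and the class lives in myproject/pipelines.py (both names are placeholders):

    ITEM_PIPELINES = {
        'myproject.pipelines.MyPipeline': 300,
    }

The number is the pipeline's order; lower values run earlier.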

 

Using Scrapy's built-in JsonItemExporter to handle the JSON output

from scrapy.exporters import JsonItemExporter


class JsonExporterPipeline(object):

    def __init__(self):
        # JsonItemExporter writes a single JSON array, so the file is opened in binary mode
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        # Close the JSON array and the file when the spider finishes
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
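
Scrapy also ships a JsonLinesItemExporter, which writes one JSON object per line, i.e. the same format as the hand-written pipeline above. A sketch of swapping it into the same skeleton (the class name and output filename are placeholders):

    from scrapy.exporters import JsonLinesItemExporter


    class JsonLinesExporterPipeline(object):

        def __init__(self):
            self.file = open('articleexport.jl', 'wb')
            self.exporter = JsonLinesItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
            self.exporter.start_exporting()

        def close_spider(self, spider):
            self.exporter.finish_exporting()
            self.file.close()

        def process_item(self, item, spider):
            self.exporter.export_item(item)
            return item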

 

Custom database pipeline: persisting the data with a blocking (synchronous) write operation

import pymysql


class MysqlPipeline(object):

    def __init__(self):
        # A single synchronous connection; every insert blocks until MySQL answers
        self.conn = pymysql.connect(host='192.168.1.1', user='root', password='123456',
                                    database='titlespider', charset='utf8', use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into article(title, create_time, url, content)
            VALUES (%s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item['title'], item['create_time'], item['url'], item['content']))
        self.conn.commit()
        return item
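
The insert statement assumes an article table already exists in the database. A possible schema sketch matching the columns used above (the column types are assumptions, not taken from the original post):

    CREATE TABLE article (
        title VARCHAR(255),
        create_time VARCHAR(50),   -- type is a guess; use DATETIME if the item stores a parsed date
        url VARCHAR(500),
        content TEXT
    );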

Using the database connection pool provided by Twisted to make the writes asynchronous and relieve the blocking caused by inserts

# First, define the database connection variables in settings.py

    MYSQL_HOST = '192.168.1.1'
    MYSQL_USER = 'root'
    MYSQL_PASSWD = '123456'
    MYSQL_DB = 'articlespider'

# Then write the pipeline class

    from twisted.enterprise import adbapi
    import pymysql
    import pymysql.cursors

    class MysqlTwistedPipeline(object):

        def __init__(self, dbpool):
            self.dbpool = dbpool

        @classmethod
        def from_settings(cls, settings):
            # Scrapy calls from_settings, so the connection parameters come from settings.py
            conn_dict = dict(
                host=settings['MYSQL_HOST'],
                user=settings['MYSQL_USER'],
                password=settings['MYSQL_PASSWD'],
                database=settings['MYSQL_DB'],
                charset='utf8',
                cursorclass=pymysql.cursors.DictCursor,
                use_unicode=True
            )
            # 'pymysql' is the name of the DB-API module that adbapi will import
            dbpool = adbapi.ConnectionPool('pymysql', **conn_dict)
            return cls(dbpool)

        def process_item(self, item, spider):
            # runInteraction executes do_insert in a thread pool, so the reactor is not blocked
            query = self.dbpool.runInteraction(self.do_insert, item)
            query.addErrback(self.handle_error)
            return item

        def do_insert(self, cursor, item):
            # adbapi passes in a cursor and commits the transaction automatically on success
            insert_sql = """
                insert into article(title, create_time, url, content)
                VALUES (%s, %s, %s, %s)
            """
            cursor.execute(insert_sql, (item['title'], item['create_time'], item['url'], item['content']))

        def handle_error(self, failure):
            # Print the failure so asynchronous insert errors are not silently swallowed
            print(failure)
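
A small refinement worth noting (a sketch, not from the original post): addErrback can forward extra arguments to the errback, so the failing item and spider can be logged with context through Scrapy's own logger instead of a bare print. Replacing the last two methods above:

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        # extra positional arguments are passed on to the errback after the failure
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        # log the failed insert together with the item's url for easier debugging
        spider.logger.error('MySQL insert failed for %s: %s', item.get('url'), failure)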

 
