scrapy使用五.一:自定义json和mysql处理数据,与使用scrapy自带模块处理
自定义json处理pipeline:使用json模块处理JSON数据
class MyPipeline(object):
    """Item pipeline that appends every item to ``article.json`` as one JSON line.

    The output file is opened when the pipeline is instantiated and closed in
    ``close_spider``.
    """

    def __init__(self):
        # codecs.open returns a text handle that encodes to UTF-8 on write.
        self.file = codecs.open('article.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write *item* as a single JSON line and return it unchanged.

        Args:
            item: The scraped item; anything accepted by ``dict()``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object, so later pipelines can keep processing it.
        """
        # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX escapes.
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes.

        Scrapy calls ``close_spider`` on pipeline components. A method that is
        merely named ``spider_closed`` is never invoked automatically, which
        left the file unclosed (and possibly unflushed).
        """
        self.file.close()

    # Backward-compatible alias for code that called the old method name directly.
    spider_closed = close_spider
使用Scrapy内置的JSON导出对象JsonItemExporter处理JSON数据
class JsonExporterPipeline(object):
    """Item pipeline that exports items to ``articleexport.json`` via JsonItemExporter.

    The exporter is started when the pipeline is instantiated and must be
    finished in ``close_spider`` so the exported JSON document is terminated
    properly.
    """

    def __init__(self):
        # The exporter is given an explicit encoding, so the file is opened in
        # binary mode and the exporter produces the encoded bytes itself.
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        """Export *item* and return it unchanged for later pipelines.

        Args:
            item: The scraped item to export.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object.
        """
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the export and close the file when the spider finishes.

        Scrapy calls ``close_spider`` on pipeline components. The previous
        ``spider_closed`` name was never invoked automatically, so
        ``finish_exporting()`` did not run and the file was never closed.
        """
        try:
            self.exporter.finish_exporting()
        finally:
            # Close the file even if finishing the export raises.
            self.file.close()

    # Backward-compatible alias for code that called the old method name directly.
    spider_closed = close_spider
自定义数据库pipeline:使用pymysql将数据写入MySQL,写入操作是同步(阻塞型)的
class MysqlPipeline(object):
    """Item pipeline that inserts every item into MySQL synchronously.

    Each ``process_item`` call executes one INSERT and commits it, so the
    crawl blocks while the database write is in progress.
    """

    def __init__(self):
        # Keyword arguments are used because PyMySQL 1.0+ no longer accepts
        # positional connection parameters.
        # The charset must be a MySQL charset name: 'utf8', not 'utf-8'.
        self.conn = pymysql.connect(
            host='192.168.1.1',
            user='root',
            password='123456',
            database='titlespider',
            charset='utf8',
            use_unicode=True,
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert *item* into the ``article`` table and return it unchanged.

        Args:
            item: Mapping with the keys ``title``, ``cteate_time``, ``url``
                and ``content``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object.

        Raises:
            KeyError: If one of the required keys is missing from ``item``.
        """
        # NOTE(review): 'cteate_time' looks like a typo of 'create_time', but it
        # is both the column name and the item key, so it is kept as-is here.
        insert_sql = """
            insert into article(title, cteate_time, url, content)
            VALUES (%s, %s, %s, %s)
        """
        params = (item['title'], item['cteate_time'], item['url'], item['content'])
        # Values are passed separately so the driver escapes them.
        self.cursor.execute(insert_sql, params)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        try:
            self.cursor.close()
        finally:
            self.conn.close()
使用twisted提供的数据库连接池,异步化写入,缓解写数据操作造成的阻塞
# Step 1 (settings.py): define the database connection settings.
MYSQL_HOST = '192.168.1.1'
MYSQL_USER = 'root'
MYSQL_PASSWD = '123456'
MYSQL_DB = 'articlespider'

# Step 2 (pipelines.py): write the pipeline class.
from twisted.enterprise import adbapi
import pymysql
import pymysql.cursors


class MysqlTwistedPipeline(object):
    """Item pipeline that inserts items into MySQL through a Twisted connection pool.

    Inserts are scheduled with ``runInteraction`` and run on the pool's
    threads, so ``process_item`` returns without waiting for the write.
    """

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the ``MYSQL_*`` values in *settings*.

        Args:
            settings: Mapping-like settings object providing ``MYSQL_HOST``,
                ``MYSQL_USER``, ``MYSQL_PASSWD`` and ``MYSQL_DB``.

        Returns:
            A pipeline instance wrapping a new ``adbapi.ConnectionPool``.
        """
        conn_kwargs = dict(
            host=settings['MYSQL_HOST'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWD'],
            database=settings['MYSQL_DB'],
            # Must be a MySQL charset name: 'utf8', not 'utf-8'.
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        # The first argument is the importable DB-API module name, which is
        # the lowercase 'pymysql' (the distribution name 'PyMySQL' is not a module).
        dbpool = adbapi.ConnectionPool('pymysql', **conn_kwargs)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule an asynchronous insert of *item* and return it unchanged.

        Args:
            item: Mapping with the keys ``title``, ``cteate_time``, ``url``
                and ``content``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object.
        """
        deferred = self.dbpool.runInteraction(self.do_insert, item)
        # The Deferred method is addErrback; 'addErrorback' does not exist.
        deferred.addErrback(self.handle_error)
        return item

    def do_insert(self, cursor, item):
        """Run the INSERT for *item* inside a pool-managed transaction.

        Args:
            cursor: Transaction object that ``runInteraction`` passes as the
                first argument; it is used like a DB-API cursor.
            item: Mapping with the keys ``title``, ``cteate_time``, ``url``
                and ``content``.
        """
        # NOTE(review): 'cteate_time' looks like a typo of 'create_time', but it
        # is both the column name and the item key, so it is kept as-is here.
        insert_sql = """
            insert into article(title, cteate_time, url, content)
            VALUES (%s, %s, %s, %s)
        """
        params = (item['title'], item['cteate_time'], item['url'], item['content'])
        # No explicit commit: runInteraction commits when this callable returns
        # and rolls back if it raises.
        cursor.execute(insert_sql, params)

    def handle_error(self, failure):
        """Report a failed asynchronous insert."""
        print(failure)

    def close_spider(self, spider):
        """Shut down the connection pool when the spider finishes."""
        self.dbpool.close()
posted on 2018-10-05 17:34 myworldworld 阅读(150) 评论(0) 收藏 举报