Scrapy persistence
Official docs: https://docs.scrapy.org/en/latest/
a. The four uses of pipelines
Official:
1. Persistence
2. Deduplication
3. Cleaning HTML data
4. Validating scraped data, checking scraped fields
My take:
persistence, deduplication, and data cleaning (stripping spaces, newlines, quotes, etc.)
b. Flow
1. The spider yields an item
2. Each enabled pipeline's process_item is called
return item  # item accepted, passed on to the next pipeline
raise DropItem  # item rejected (dropped)
Note: the Item class can define table_name = "your table name", which is used later for MySQL storage; a minimal sketch of this flow follows this list.
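A minimal sketch of the yield-item / process_item flow, with table_name defined on the Item class (the author and content fields match the items used in the Redis pipeline further down; the table name and validation rule are only illustrative):

import scrapy
from scrapy.exceptions import DropItem


class QuoteItem(scrapy.Item):
    table_name = 'quotes'        # picked up by the MySQL pipeline below
    author = scrapy.Field()
    content = scrapy.Field()


class ValidationPipeline(object):
    def process_item(self, item, spider):
        # validation / cleaning step: keep the item only if content is present
        if item.get('content'):
            return item          # accepted: handed to the next pipeline
        raise DropItem('missing content')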
I. Disk files
1. Command-line (feed export) persistence
Yield items from the spider, then export them with the -o option of the crawl command (a minimal spider is sketched below), e.g.:
scrapy crawl quote -o quotes.json
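A minimal spider that the command above could export, assuming quotes.toscrape.com as the target site (spider and field names are illustrative):

import scrapy


class QuoteSpider(scrapy.Spider):
    name = 'quote'  # matches "scrapy crawl quote"
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.css('div.quote'):
            # every yielded dict (or Item) becomes one record in quotes.json
            yield {
                'author': quote.css('small.author::text').get(),
                'content': quote.css('span.text::text').get(),
            }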
2. Pipeline-based persistence
Saving images (see the ImagesPipeline sketch below)
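For saving images, Scrapy ships a built-in ImagesPipeline (it requires Pillow); a minimal configuration sketch, using the default image_urls / images fields the built-in pipeline expects (the store path is a placeholder):

# settings.py
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 300,
}
IMAGES_STORE = './images'        # directory where downloaded images are stored

# items.py
import scrapy

class ImageItem(scrapy.Item):
    image_urls = scrapy.Field()  # list of image URLs to download
    images = scrapy.Field()      # filled in by the pipeline with download results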
II. Databases
1. MySQL (note: a custom IP/proxy pool pipeline can be built by adapting this pattern)
import pymysql


class MysqlPipeline(object):
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT')
        )

    def open_spider(self, spider):
        self.conn = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                    database=self.database, charset='utf8mb4', port=self.port)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        data = dict(item)
        keys = ','.join(data.keys())
        values = ','.join(['%s'] * len(data))
        # table_name is defined on the Item class (see the note above)
        sql = 'insert into %s (%s) values (%s)' % (item.table_name, keys, values)
        try:
            self.cursor.execute(sql, tuple(data.values()))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
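The settings this pipeline reads in from_crawler, and its registration in ITEM_PIPELINES, would look roughly like this (all values and the module path are placeholders for your own project):

# settings.py
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'spider'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
MYSQL_PORT = 3306

ITEM_PIPELINES = {
    'firstproject.pipelines.MysqlPipeline': 300,   # lower number = earlier in the chain
}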
2. MongoDB
import pymongo


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # assumes the Item class defines a collection name (like table_name for MySQL);
        # falls back to 'items' otherwise
        collection = getattr(item, 'collection', 'items')
        try:
            self.db[collection].insert_one(dict(item))
        except Exception as e:
            print(e)
        return item

    def close_spider(self, spider):
        self.client.close()
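The matching settings (placeholder values) read by from_crawler above:

# settings.py
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DB = 'spider'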
3. Redis
import json

import redis


class FirstprojectPipeline(object):
    def __init__(self, redis_host, redis_port, redis_pwd):
        self.redis_host = redis_host
        self.redis_port = redis_port
        self.redis_pwd = redis_pwd

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            redis_host=crawler.settings.get('REDIS_HOST'),
            redis_port=crawler.settings.get('REDIS_PORT'),
            redis_pwd=crawler.settings.get('REDIS_PWD')
        )

    def open_spider(self, spider):
        print("start spider")
        self.conn = redis.Redis(host=self.redis_host, port=self.redis_port, password=self.redis_pwd)

    def process_item(self, item, spider):
        my_dict = {
            'author': item['author'],
            'content': item['content'],
        }
        try:
            # push each item onto a Redis list as a JSON string
            self.conn.rpush("my_data", json.dumps(my_dict))
        except Exception as e:
            print(e)
        return item
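To check what was stored, the list can be read back outside of Scrapy; a small sketch assuming the same my_data key and a local Redis instance:

import json

import redis

conn = redis.Redis(host='localhost', port=6379)
# lrange 0 -1 returns the whole list; each element is a JSON string pushed by the pipeline
for raw in conn.lrange('my_data', 0, -1):
    print(json.loads(raw))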
Or, deduplicating with a Redis set:
import redis

from scrapy.exceptions import DropItem


class FirstprojectPipeline(object):
    def __init__(self, redis_host, redis_port, redis_pwd):
        self.redis_host = redis_host
        self.redis_port = redis_port
        self.redis_pwd = redis_pwd

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            redis_host=crawler.settings.get('REDIS_HOST'),
            redis_port=crawler.settings.get('REDIS_PORT'),
            redis_pwd=crawler.settings.get('REDIS_PWD')
        )

    def open_spider(self, spider):
        self.conn = redis.Redis(host=self.redis_host, port=self.redis_port, password=self.redis_pwd)

    def process_item(self, item, spider):
        # Redis set: sadd returns 1 for a new member, 0 for a duplicate
        if self.conn.sadd(spider.name, item['name']):
            return item
        raise DropItem("duplicate item: %s" % item['name'])
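Because sadd returns 1 for a new member and 0 for an existing one, and the set key is spider.name, this gives per-spider deduplication: any item whose item['name'] has already been recorded is dropped via DropItem and logged by Scrapy as dropped.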
