Download and install the MySQL driver. First activate the virtual environment, then install mysqlclient from the Douban mirror:
pip install -i https://pypi.douban.com/simple mysqlclient
If the install fails on Windows, download a prebuilt wheel from http://www.lfd.uci.edu/~gohlke/pythonlibs/#mysqlclient and install that instead.
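After downloading the .whl that matches your Python version and architecture, install it with pip. The filename below is only an example; yours will differ:

pip install mysqlclient-1.4.6-cp37-cp37m-win_amd64.whl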
On Linux, install the system dependency first: sudo apt-get install libmysqlclient-dev
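Whichever route you took, a quick sketch to confirm the driver works. The host, user, password, and database name reuse this project's values from the pipeline code below; substitute your own:

# check that MySQLdb imports and can reach the server
import MySQLdb

conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'article_spider', charset='utf8')
print(conn.get_server_info())  # prints the MySQL server version on success
conn.close()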
Database table design
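The original table design isn't reproduced here, but the insert statements in the pipeline code below imply at least the columns title, url, create_date, and fav_nums, plus the front_image_path filled in by the images pipeline. A minimal sketch that creates such a table through MySQLdb; the column types and sizes are assumptions, not the original schema:

# create_table.py -- hypothetical helper; column types and sizes are guesses
import MySQLdb

conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'article_spider', charset='utf8')
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS jobbole_article (
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(200) NOT NULL,
        url VARCHAR(300),
        create_date DATE,
        fav_nums INT DEFAULT 0,
        front_image_path VARCHAR(300)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()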

JSON export configuration and database handling
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import codecs  # like open(), but sidesteps most of the encoding problems
import json

import MySQLdb
import MySQLdb.cursors
from scrapy.exporters import JsonItemExporter
from scrapy.pipelines.images import ImagesPipeline
from twisted.enterprise import adbapi  # turns blocking MySQL calls into asynchronous ones


class ArticlespiderPipeline(object):
    def process_item(self, item, spider):
        return item


# Hand-written pipeline: dump each item to a JSON file
class JsonWithEncodeingPipeline(object):
    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped
        lines = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(lines)
        return item

    def close_spider(self, spider):
        self.file.close()


class MysqlPipeline(object):
    """MySQLdb.connect parameters, for reference:
    :param str host: host to connect
    :param str user: user to connect as
    :param str password: password to use
    :param str passwd: alias of password, for backward compatibility
    :param str database: database to use
    :param str db: alias of database, for backward compatibility
    :param int port: TCP/IP port to connect to
    :param str unix_socket: location of unix_socket to use
    :param dict conv: conversion dictionary, see MySQLdb.converters
    :param int connect_timeout:
    """
    # Writes to MySQL synchronously: every insert blocks the crawl
    def __init__(self):
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'article_spider',
                                    charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            VALUES (%s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item["title"], item["url"],
                                         item["create_date"], item["fav_nums"]))
        self.conn.commit()
        return item


class MysqlTwistedPipline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # use twisted to run the MySQL insert asynchronously
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # hook up error handling
        return item

    def handle_error(self, failure, item, spider):
        # handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # run the actual insert; each item type builds its own SQL statement
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)


# Use the JsonItemExporter that scrapy provides to export a JSON file
class JsonItemExporterPipleline(object):
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()  # start exporting

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


class ArticleImagepiple(ImagesPipeline):
    def item_completed(self, results, item, info):
        # results is a list of (ok, value) tuples; value['path'] is where the image was saved
        for ok, value in results:
            image_file_path = value['path']
            item['front_image_path'] = image_file_path
        return item
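MysqlTwistedPipline.do_insert calls item.get_insert_sql(), a method defined on the item class in items.py (not shown in this post). A minimal sketch of what it might look like, assuming the same jobbole_article columns as the synchronous pipeline; the item class name and field set are illustrative:

# items.py (sketch) -- get_insert_sql is a project convention, not a scrapy API
import scrapy

class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    create_date = scrapy.Field()
    fav_nums = scrapy.Field()
    front_image_url = scrapy.Field()
    front_image_path = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            VALUES (%s, %s, %s, %s)
        """
        params = (self["title"], self["url"], self["create_date"], self["fav_nums"])
        return insert_sql, params

Keeping the SQL next to the item definition lets MysqlTwistedPipline stay generic: it can insert any item type that implements get_insert_sql.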
settings.py configuration
import os

ITEM_PIPELINES = {
    # items flow through every enabled pipeline; the lower the number, the earlier it runs
    # 'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
    # 'ArticleSpider.pipelines.JsonItemExporterPipleline': 2,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    # 'ArticleSpider.pipelines.MysqlTwistedPipline': 1,
    # 'ArticleSpider.pipelines.ArticleImagepiple': 1,
}

IMAGES_URLS_FIELD = 'front_image_url'  # the images pipeline reads image URLs from this item field
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')  # images/ under the ArticleSpider directory
IMAGES_MIN_HEIGHT = 100  # minimum height and width; smaller images are skipped
IMAGES_MIN_WIDTH = 100

MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'article_spider'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'
