Scrapy Usage, Part 5: Configuring MySQL, MongoDB, and Redis with Scrapy
I. Configuring MySQL
Edit settings.py:
# start MySQL database configure setting
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'cnblogsdb'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'root'
# end of MySQL database configure setting
Edit pipelines.py:
[root@bogon cnblogs]# more pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import signals
import json
import codecs
from twisted.enterprise import adbapi
from twisted.python import log          # used for error logging in _handle_error
from datetime import datetime
from hashlib import md5
import MySQLdb
import MySQLdb.cursors


class JsonWithEncodingCnblogsPipeline(object):
    def __init__(self):
        self.file = codecs.open('cnblogs.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def spider_closed(self, spider):
        self.file.close()


class MySQLStoreCnblogsPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    # called by the pipeline framework for every item
    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self._do_upinsert, item, spider)
        d.addErrback(self._handle_error, item, spider)
        d.addBoth(lambda _: item)
        return d

    # update or insert each row in the database
    def _do_upinsert(self, conn, item, spider):
        linkmd5id = self._get_linkmd5id(item)
        now = datetime.utcnow().replace(microsecond=0).isoformat(' ')
        conn.execute("""
            select 1 from cnblogsinfo where linkmd5id = %s
        """, (linkmd5id, ))
        ret = conn.fetchone()
        if ret:
            conn.execute("""
                update cnblogsinfo set title = %s, description = %s, link = %s,
                    listUrl = %s, updated = %s where linkmd5id = %s
            """, (item['title'], item['desc'], item['link'], item['listUrl'], now, linkmd5id))
        else:
            conn.execute("""
                insert into cnblogsinfo(linkmd5id, title, description, link, listUrl, updated)
                values(%s, %s, %s, %s, %s, %s)
            """, (linkmd5id, item['title'], item['desc'], item['link'], item['listUrl'], now))

    # hash the url with md5, used to avoid collecting duplicates
    def _get_linkmd5id(self, item):
        return md5(item['link']).hexdigest()

    # error handling
    def _handle_error(self, failure, item, spider):
        log.err(failure)
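The pipeline above assumes a cnblogsinfo table already exists in the cnblogsdb database. A minimal sketch for creating it; the column names come from the INSERT/UPDATE statements above, while the column types and lengths are assumptions:

# Sketch only: column names are taken from the pipeline's SQL above;
# the types, lengths, and engine defaults are assumptions.
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', passwd='root',
                       db='cnblogsdb', charset='utf8')
cur = conn.cursor()
cur.execute("""
    create table if not exists cnblogsinfo (
        linkmd5id   char(32)     not null primary key,  -- md5 of the link, used for dedup
        title       varchar(255),
        description text,
        link        varchar(255),
        listUrl     varchar(255),
        updated     datetime
    ) default charset=utf8
""")
conn.commit()
conn.close()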
Edit the settings.py configuration file to register MySQLStoreCnblogsPipeline:
ITEM_PIPELINES = {
    'cnblogs.pipelines.JsonWithEncodingCnblogsPipeline': 300,
    'cnblogs.pipelines.MySQLStoreCnblogsPipeline': 300,
}
II. Configuring MongoDB
Install pymongo:
pip install pymongo
Configure the MongoDB host, port, database name, and collection name in settings.py, so the MongoDB connection details can be swapped out conveniently.
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'your_database_name'
MONGODB_DOCNAME = 'your_collection_name'
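A quick way to confirm these values before wiring them into a pipeline is to connect with pymongo directly (a minimal sketch; the database name is the placeholder from above):

import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
print(client.server_info())            # raises an error if the MongoDB server is unreachable
db = client['your_database_name']
print(db.list_collection_names())      # empty list for a fresh database (pymongo 3.6+)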
Reference the pipeline in settings.py so that it takes effect:
ITEM_PIPELINES = {
    'novespider.pipelines.NovespiderPipeline': 300,
}
Use MongoDB in pipelines.py:
from scrapy.conf import settings
import pymongo


class NovespiderPipeline(object):
    def __init__(self):
        # connect to MongoDB using the values from settings.py
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbName = settings['MONGODB_DBNAME']
        table = settings['MONGODB_DOCNAME']
        client = pymongo.MongoClient(host=host, port=port)
        db = client[dbName]
        self.table = db[table]

    def process_item(self, item, spider):
        bookInfo = dict(item)
        self.table.insert_one(bookInfo)   # insert_one replaces the deprecated insert()
        return item
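Note that scrapy.conf is deprecated in newer Scrapy releases. A minimal sketch of the same pipeline written against the from_crawler hook instead, reusing the setting names above:

import pymongo


class NovespiderPipeline(object):
    def __init__(self, host, port, db_name, doc_name):
        client = pymongo.MongoClient(host=host, port=port)
        self.table = client[db_name][doc_name]

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this hook and passes the crawler, which exposes the settings
        s = crawler.settings
        return cls(s.get('MONGODB_HOST'), s.getint('MONGODB_PORT'),
                   s.get('MONGODB_DBNAME'), s.get('MONGODB_DOCNAME'))

    def process_item(self, item, spider):
        self.table.insert_one(dict(item))
        return item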
Example: crawl the nine books of Daomu Biji (盗墓笔记) and their chapters
1. Create the project and the spider
scrapy startproject novespider
cd novespider
scrapy genspider novspider daomubiji.com
2. Identify what to extract and write items.py
Open the site: http://www.daomubiji.com/
Inspect the page structure: each book is a separate table.
import scrapy


class NovespiderItem(scrapy.Item):
    bookName = scrapy.Field()
    bookTitle = scrapy.Field()
    chapterNum = scrapy.Field()
    chapterName = scrapy.Field()
    chapterURL = scrapy.Field()
3. Write the spider file, spiders/novspider.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from novespider.items import NovespiderItem


class NovspiderSpider(scrapy.Spider):
    name = "novspider"
    allowed_domains = ["daomubiji.com"]
    start_urls = ['http://www.daomubiji.com']

    def parse(self, response):
        selector = Selector(response)
        table = selector.xpath("//table")  # in newer Scrapy you can simply use response.xpath("//table")
        for each in table:
            bookName = each.xpath("tr/td[@colspan='3']/center/h2/text()").extract()[0]
            content = each.xpath("tr/td/a/text()").extract()
            url = each.xpath("tr/td/a/@href").extract()
            for i in range(len(url)):
                item = NovespiderItem()
                # book name and chapter URL
                item['bookName'] = bookName
                item['chapterURL'] = url[i]
                try:
                    # book title and chapter number, split from the link text
                    item['bookTitle'] = content[i].split(' ')[0]
                    item['chapterNum'] = content[i].split(' ')[1]
                except Exception:
                    continue
                try:
                    item['chapterName'] = content[i].split(' ')[2]
                except Exception:
                    # no third part: fall back to the last three characters of the second part
                    item['chapterName'] = content[i].split(' ')[1][-3:]
                yield item
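The split logic above assumes link text of the form "book-title chapter-number chapter-name", separated by spaces. A small standalone sketch of that parsing, with an invented sample string purely for illustration:

# Illustration only: the sample string is made up; real link text comes from the site.
sample = u'七星鲁王 第一章 血尸'
parts = sample.split(' ')
bookTitle = parts[0]                                     # first field
chapterNum = parts[1]                                    # second field
chapterName = parts[2] if len(parts) > 2 else parts[1][-3:]  # fallback mirrors the spider
print(bookTitle, chapterNum, chapterName)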
III. Configuring Redis
Install scrapy_redis:
pip install scrapy_redis
Then configure Redis in settings.py:
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
REDIS_URL = None
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
Reuse the project and spider from above.
Add a text field to items.py for the chapter body text:
import scrapy


class NovespiderItem(scrapy.Item):
    bookName = scrapy.Field()
    bookTitle = scrapy.Field()
    chapterNum = scrapy.Field()
    chapterName = scrapy.Field()
    chapterURL = scrapy.Field()
    text = scrapy.Field()
Write the spider:
# -*- coding: utf-8 -*-
import re
import scrapy
from scrapy_redis.spiders import RedisSpider
from scrapy.selector import Selector
from novespider.items import NovespiderItem


class NovspiderSpider(RedisSpider):
    name = "novspider"
    # RedisSpider reads its start URLs from this Redis list instead of start_urls
    redis_key = 'novspider:start_urls'
    start_urls = ['http://www.daomubiji.com/qi-xing-lu-wang-01.html']

    def parse(self, response):
        selector = Selector(response)
        table = selector.xpath("//table")  # in newer Scrapy you can simply use response.xpath("//table")
        for each in table:
            bookName = each.xpath("tr/td[@colspan='3']/center/h2/text()").extract()[0]
            content = each.xpath("tr/td/a/text()").extract()
            url = each.xpath("tr/td/a/@href").extract()
            for i in range(len(url)):
                item = NovespiderItem()
                # book name and chapter URL
                item['bookName'] = bookName
                item['chapterURL'] = url[i]
                try:
                    # book title and chapter number, split from the link text
                    item['bookTitle'] = content[i].split(' ')[0]
                    item['chapterNum'] = content[i].split(' ')[1]
                except Exception:
                    continue
                try:
                    item['chapterName'] = content[i].split(' ')[2]
                except Exception:
                    # no third part: fall back to the last three characters of the second part
                    item['chapterName'] = content[i].split(' ')[1][-3:]
                # follow the chapter link and carry the partly filled item along in meta
                yield scrapy.Request(url[i], callback=self.parseContent, meta={'item': item})

    def parseContent(self, response):
        selector = Selector(response)
        item = response.meta['item']
        html = selector.xpath('//div[@class="content"]').extract()[0]
        # grab the block between the clear:both div and the next div, then collect the <p> paragraphs
        textField = re.search('<div style="clear:both"></div>(.*?)<div', html, re.S).group(1)
        text = re.findall('<p>(.*?)</p>', textField, re.S)
        fulltext = ""
        for each in text:
            fulltext += each
        item['text'] = fulltext
        yield item
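With the scrapy_redis scheduler in place, the crawl is kicked off by pushing a start URL onto the spider's Redis key and then running scrapy crawl novspider. A minimal sketch using the redis client; the key name follows scrapy_redis's default "<spider name>:start_urls" convention, and the host/port match the settings above:

import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379)
# scrapy_redis pops start URLs from this list while the spider is running
r.lpush('novspider:start_urls', 'http://www.daomubiji.com/qi-xing-lu-wang-01.html')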