Scrapy Usage (5): Configuring MySQL, MongoDB, and Redis in Scrapy

I. Configuring MySQL

Modify settings.py:

# start MySQL database configure setting
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'cnblogsdb'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'root'
# end of MySQL database configure setting
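
The MySQL pipeline shown next writes into a cnblogsinfo table that has to exist in cnblogsdb beforehand. The original post does not give the schema, so the script below is only a minimal sketch with assumed column types that match the columns the pipeline uses:

# create_table.py -- minimal sketch; the column types and sizes are assumptions
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', passwd='root',
                       db='cnblogsdb', charset='utf8')
cur = conn.cursor()
cur.execute("""
    create table if not exists cnblogsinfo (
        linkmd5id   char(32)     not null primary key,  -- md5 of the article link
        title       varchar(255),
        description text,
        link        varchar(255),
        listUrl     varchar(255),
        updated     datetime
    )
""")
conn.commit()
conn.close()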

 

Modify pipelines.py:

[root@bogon cnblogs]# more pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy import signals
import json
import codecs
from twisted.enterprise import adbapi
from twisted.python import log  # used by _handle_error below
from datetime import datetime
from hashlib import md5
import MySQLdb
import MySQLdb.cursors

class JsonWithEncodingCnblogsPipeline(object):
    def __init__(self):
        self.file = codecs.open('cnblogs.json', 'w', encoding='utf-8')
    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item
    def close_spider(self, spider):
        # called automatically by Scrapy when the spider finishes
        self.file.close()

class MySQLStoreCnblogsPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool
    
    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass = MySQLdb.cursors.DictCursor,
            use_unicode= True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    # Called by Scrapy for every scraped item
    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self._do_upinsert, item, spider)
        d.addErrback(self._handle_error, item, spider)
        d.addBoth(lambda _: item)
        return d
    # Update existing rows or insert new ones
    def _do_upinsert(self, conn, item, spider):
        linkmd5id = self._get_linkmd5id(item)
        now = datetime.utcnow().replace(microsecond=0).isoformat(' ')
        conn.execute("""
                select 1 from cnblogsinfo where linkmd5id = %s
        """, (linkmd5id, ))
        ret = conn.fetchone()

        if ret:
            conn.execute("""
                update cnblogsinfo set title = %s, description = %s, link = %s, listUrl = %s, updated = %s where linkmd5id = %s
            """, (item['title'], item['desc'], item['link'], item['listUrl'], now, linkmd5id))
        else:
            conn.execute("""
                insert into cnblogsinfo(linkmd5id, title, description, link, listUrl, updated)
                values(%s, %s, %s, %s, %s, %s)
            """, (linkmd5id, item['title'], item['desc'], item['link'], item['listUrl'], now))
    # Compute the MD5 digest of the URL
    def _get_linkmd5id(self, item):
        # hashing the link avoids storing the same article twice
        return md5(item['link']).hexdigest()
    # Error handling
    def _handle_error(self, failure, item, spider):
        log.err(failure)

 

Modify the settings.py configuration file to enable MySQLStoreCnblogsPipeline:

ITEM_PIPELINES = {
    'cnblogs.pipelines.JsonWithEncodingCnblogsPipeline': 300,
    'cnblogs.pipelines.MySQLStoreCnblogsPipeline': 301,
}

 

II. Configuring MongoDB

Install pymongo:

pip install pymongo

Configure the MongoDB host, port, database name, and collection name in settings.py, so the MongoDB connection details can be swapped out without touching the pipeline code.

MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'your_database_name'
MONGODB_DOCNAME = 'your_collection_name'

Reference the pipeline in settings.py so that it takes effect:

ITEM_PIPELINES = {'novespider.pipelines.NovespiderPipeline': 300}

Use MongoDB in pipelines.py:

from scrapy.conf import settings  # deprecated in newer Scrapy; prefer crawler.settings via from_crawler()
import pymongo

class NovespiderPipeline(object):
    def __init__(self):
        # Connect to MongoDB using the values from settings.py
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbName = settings['MONGODB_DBNAME']
        table = settings['MONGODB_DOCNAME']
        client = pymongo.MongoClient(host=host, port=port)
        db = client[dbName]
        self.table = db[table]

    def process_item(self, item, spider):
        bookInfo = dict(item)
        self.table.insert_one(bookInfo)  # insert() is deprecated in pymongo 3
        return item
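
To confirm that items are really landing in MongoDB, you can query the collection directly with pymongo. A quick sanity-check sketch, assuming the same settings values as above (the database and collection names are placeholders):

# check_mongo.py -- quick sanity check, not part of the original pipeline
import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
db = client['your_database_name']        # must match MONGODB_DBNAME
collection = db['your_collection_name']  # must match MONGODB_DOCNAME

print(collection.count_documents({}))    # number of stored items
for doc in collection.find().limit(3):   # peek at a few documents
    print(doc)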

Example: crawl the nine Daomubiji books and all of their chapters

1. Create the project and the spider

scrapy startproject novespider
cd novespider
scrapy genspider novspider daomubiji.com

 

2. Define the fields and write items.py

Open the site: http://www.daomubiji.com/

Inspect the page structure: each book is rendered as one table.

import scrapy
class NovespiderItem(scrapy.Item):
    bookName = scrapy.Field()
    bookTitle = scrapy.Field()
    chapterNum = scrapy.Field()
    chapterName = scrapy.Field()
    chapterURL = scrapy.Field()

 

3. Write the spider file, spiders/novspider.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from novespider.items import NovespiderItem

class NovspiderSpider(scrapy.Spider):
    name = "novspider"
    allowed_domains = ["daomubiji.com"]
    start_urls = ['http://www.daomubiji.com']

    def parse(self, response):
        selector = Selector(response)
        table = selector.xpath("//table")
        # In newer Scrapy you can simply write response.xpath("//table")
        for each in table:
            bookName = each.xpath("tr/td[@colspan='3']/center/h2/text()").extract()[0]
            content = each.xpath("tr/td/a/text()").extract()
            url = each.xpath("tr/td/a/@href").extract()
            for i in range(len(url)):
                item = NovespiderItem()
                # Book name and chapter URL
                item['bookName'] = bookName
                item['chapterURL'] = url[i]
                try:
                    # Book title and chapter number
                    item['bookTitle'] = content[i].split(' ')[0]
                    item['chapterNum'] = content[i].split(' ')[1]
                except Exception as e:
                    continue
                try:
                    item['chapterName'] = content[i].split(' ')[2]
                except Exception as e:
                    # fall back to the tail of the second field when there is no third one
                    item['chapterName'] = content[i].split(' ')[1][-3:]
                yield item

 

III. Configuring Redis

Install scrapy_redis:

pip install scrapy_redis

Then enable it in settings.py:

SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
REDIS_URL = None
REDIS_HOST = "127.0.0.1"
REDIS_PORT = 6379
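
scrapy_redis is normally paired with its Redis-based duplicate filter as well, so that request fingerprints are shared by every worker pointing at the same Redis instance. This extra line is not in the original post, but it is the setting scrapy_redis documents for this purpose:

# optional, but commonly enabled together with the scheduler above
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"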

Reuse the project and spider from above.

 

Add a text field for the chapter body to items.py:

import scrapy
class NovespiderItem(scrapy.Item):
    bookName = scrapy.Field()
    bookTitle = scrapy.Field()
    chapterNum = scrapy.Field()
    chapterName = scrapy.Field()
    chapterURL = scrapy.Field()
    text = scrapy.Field()

 

Write the spider:

# -*- coding: utf-8 -*-
import scrapy
from scrapy_redis.spiders import RedisSpider
from scrapy.selector import Selector
from novespider.items import NovespiderItem
import re

class NovspiderSpider(RedisSpider):
    name = "novspider"
    # A RedisSpider takes its start URLs from a Redis list instead of start_urls;
    # push the index page (http://www.daomubiji.com) into this key to start the crawl
    redis_key = 'novspider:start_urls'

    def parse(self, response):
        selector = Selector(response)
        table = selector.xpath("//table")
        # In newer Scrapy you can simply write response.xpath("//table")
        for each in table:
            bookName = each.xpath("tr/td[@colspan='3']/center/h2/text()").extract()[0]
            content = each.xpath("tr/td/a/text()").extract()
            url = each.xpath("tr/td/a/@href").extract()
            for i in range(len(url)):
                item = NovespiderItem()
                # Book name and chapter URL
                item['bookName'] = bookName
                item['chapterURL'] = url[i]
                try:
                    # Book title and chapter number
                    item['bookTitle'] = content[i].split(' ')[0]
                    item['chapterNum'] = content[i].split(' ')[1]
                except Exception as e:
                    continue
                try:
                    item['chapterName'] = content[i].split(' ')[2]
                except Exception as e:
                    item['chapterName'] = content[i].split(' ')[1][-3:]
                # hand the partially filled item to the chapter-page callback
                yield scrapy.Request(url[i], callback=self.parseContent, meta={'item': item})

    def parseContent(self, response):
        selector = Selector(response)
        item = response.meta['item']
        html = selector.xpath('//div[@class="content"]').extract()[0]
        # Take everything between the "clear:both" div and the next div, then collect the <p> text
        textField = re.search('<div style="clear:both"></div>(.*?)<div', html, re.S).group(1)
        text = re.findall('<p>(.*?)</p>', textField, re.S)
        fulltext = ""
        for each in text:
            fulltext += each
        item['text'] = fulltext
        yield item
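
Because the spider is now a RedisSpider, nothing happens until a start URL is pushed into the Redis list it listens on. A small seeding sketch using the redis-py client (the key name novspider:start_urls is the one assumed in the spider above):

# seed_redis.py -- push the index page into the queue the RedisSpider reads from
import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379)
r.lpush('novspider:start_urls', 'http://www.daomubiji.com')

After seeding, start one or more workers with scrapy crawl novspider; they will all pull requests from the same Redis queue.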

 
