封装一个基于 MongoDB 的缓存类:爬取网页时可以调用它,把下载结果保存到数据库中。

# coding=utf-8

import pickle
import zlib
from datetime import datetime,timedelta

import requests
from pymongo import MongoClient
from bson.binary import Binary


class MongoCache(object):
    """Dict-like cache that keeps pickled, zlib-compressed values in MongoDB.

    Entries live in the ``webpage`` collection of the ``cache`` database and
    are keyed by ``_id`` (for example the URL of a downloaded page). A TTL
    index on ``timestamp`` asks the server to remove entries once they are
    older than ``expires``.
    """

    def __init__(self, client=None, expires=timedelta(days=30)):
        """Attach to the cache database and make sure the TTL index exists.

        :param client: an existing ``MongoClient`` to reuse; when ``None`` a
            new client for ``localhost:27017`` is created.
        :param expires: ``timedelta`` after which a cached entry expires.
        """
        # Honour a caller-supplied client instead of always opening a new
        # connection (the parameter used to be silently ignored).
        if client is None:
            client = MongoClient("localhost", 27017)
        self.client = client
        self.db = self.client.cache
        # TTL index: once a document's ``timestamp`` is older than
        # expireAfterSeconds, MongoDB deletes the document automatically.
        self.db.webpage.create_index(
            'timestamp', expireAfterSeconds=expires.total_seconds())

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``, replacing any existing entry.

        :param key: document ``_id``, e.g. the page URL.
        :param value: any picklable object.
        """
        # Pickle, compress, and stamp the record with the current UTC time
        # (the TTL index counts from this timestamp).
        record = {
            "result": Binary(zlib.compress(pickle.dumps(value))),
            "timestamp": datetime.utcnow(),
        }
        # Upsert: insert when the key is new, otherwise overwrite the stored
        # fields via $set. ``update_one`` replaces the ``Collection.update``
        # call that was removed in PyMongo 4.
        self.db.webpage.update_one({"_id": key}, {'$set': record}, upsert=True)

    def __getitem__(self, item):
        """Return the cached value stored under ``item``.

        :param item: document ``_id``, e.g. the page URL.
        :raises KeyError: if no entry exists for ``item``.
        """
        record = self.db.webpage.find_one({"_id": item})
        if record is None:
            # format() also works for non-string keys, where ``item + "..."``
            # would have raised TypeError instead of KeyError.
            raise KeyError("{} does not exist".format(item))
        # SECURITY NOTE: pickle.loads executes arbitrary code embedded in the
        # payload; only use this cache with a database you fully trust.
        return pickle.loads(zlib.decompress(record["result"]))

    def __contains__(self, item):
        """Return True if an entry for ``item`` is present in the cache."""
        # Only fetch the _id field: a membership test does not need to
        # download, decompress and unpickle the whole payload.
        found = self.db.webpage.find_one({"_id": item}, projection={"_id": 1})
        return found is not None

    def clear(self):
        """Drop the ``webpage`` collection, removing every cached entry."""
        self.db.webpage.drop()


if __name__ == '__main__':
    # Demo: download one page and store its raw bytes in the cache.
    mongoCache = MongoCache()
    url = 'http://www.51hei.com/bbs/dpj-135132-1.html'
    # Without a timeout requests can block forever on an unresponsive server.
    response = requests.get(url, timeout=10)
    # Do not cache error pages (4xx/5xx) as if they were the real content.
    response.raise_for_status()
    mongoCache[url] = response.content

 

posted on 2018-12-19 11:45  零度风格  阅读(243)  评论(0)  编辑  收藏  举报

导航