【Jabberd2源码剖析系列 xhash】

xhash是jabberd2的哈希表, 并提供了迭代器用于遍历xhash.

 

解释一下结构体的命名, xht_struct意思是x hash table, xhn_struct意思是x hash node, 这样方便理解记忆.

xhn_struct的成员变量顾名思义, 不赘述.

xht_struct中, p是内存池, 负责node的分配等, zen是桶数组, free_list是回收的node内存, iter_bucket和iter_node被用于记录迭代器的位置.

/*
 * One hash entry.  The same structure is used for the inline head node of
 * every bucket and for the nodes chained behind it.
 * `xhn` is the pointer type, `_xhn` the structure type itself.
 */
typedef struct xhn_struct
{
    struct xhn_struct *next; /* next node in the same bucket chain (also links the free list), NULL at the end */
    struct xhn_struct *prev; /* previous node in the bucket chain */
    const char *key;         /* key bytes; the caller's pointer is stored as is, NULL marks an unused or zapped node */
    int keylen;              /* length of key in bytes */
    void *val;               /* value pointer supplied by the caller */
} *xhn, _xhn;

/*
 * Hash table handle.
 * `xht` is the pointer type, `_xht` the structure type itself.
 */
typedef struct xht_struct
{
    pool_t p;                     /* pool the table, its bucket array and its nodes are allocated from */
    int prime;                    /* number of buckets; hash codes are reduced modulo this value */
    int dirty;                    /* incremented by every put and zap, read and reset by xhash_dirty() */
    int count;                    /* number of entries currently stored */
    struct xhn_struct *zen;       /* bucket array: `prime` inline head nodes (not pointers) */
    struct xhn_struct *free_list; /* zapped nodes kept for reuse, linked through `next` */
    int iter_bucket;              /* bucket the iterator is in, -1 when no iteration is active */
    xhn iter_node;                /* node the iterator points at, NULL when there is none */
    int *stat;                    /* per-bucket entry counters; only allocated when XHASH_DEBUG is defined, else NULL */
} *xht, _xht;

/* create a table with `prime` buckets */
JABBERD2_API xht xhash_new(int prime);
/* insert or replace an entry; xhash_put() takes strlen(key) as the length, xhash_putx() an explicit one */
JABBERD2_API void xhash_put(xht h, const char *key, void *val);
JABBERD2_API void xhash_putx(xht h, const char *key, int len, void *val);
/* look up a value, NULL when absent; xhash_get() takes strlen(key) as the length */
JABBERD2_API void *xhash_get(xht h, const char *key);
JABBERD2_API void *xhash_getx(xht h, const char *key, int len);
/* remove an entry by key. NOTE(review): xhash_zap() is not defined in this excerpt;
 * presumably it is the strlen() wrapper around xhash_zapx() - confirm */
JABBERD2_API void xhash_zap(xht h, const char *key);
JABBERD2_API void xhash_zapx(xht h, const char *key, int len);
/* NOTE(review): not defined in this excerpt; presumably reports the XHASH_DEBUG bucket counters - confirm */
JABBERD2_API void xhash_stat(xht h);
/* release the table together with its pool */
JABBERD2_API void xhash_free(xht h);
/* callback type for xhash_walk(): receives key, key length, value and the caller's `arg` */
typedef void (*xhash_walker)(const char *key, int keylen, void *val, void *arg);
/* call `w` once for every entry whose key and value are both non-NULL */
JABBERD2_API void xhash_walk(xht h, xhash_walker w, void *arg);
/* return the number of puts/zaps since the previous call and reset it */
JABBERD2_API int xhash_dirty(xht h);
/* return the number of stored entries */
JABBERD2_API int xhash_count(xht h);
/* return the pool the table allocates from */
JABBERD2_API pool_t xhash_pool(xht h);

/* iteration functions */
/* each returns 1 while the iterator points at an entry, 0 otherwise */
JABBERD2_API int xhash_iter_first(xht h);
JABBERD2_API int xhash_iter_next(xht h);
/* remove the entry the iterator currently points at */
JABBERD2_API void xhash_iter_zap(xht h);
/* fetch key/keylen and/or value of the current entry; returns 1 on success */
JABBERD2_API int xhash_iter_get(xht h, const char **key, int *keylen, void **val);

首先是二进制哈希函数, 会根据一段内存算出哈希值.

/*
 * Compute the hash code of the first `len` bytes starting at `s`.
 *
 * The mixing step is the ELF hashing algorithm as reprinted in
 * Andrew Binstock, "Hashing Rehashed," Dr. Dobb's Journal, April 1996.
 * A non-positive `len` yields 0.
 */
static int _xhasher(const char *s, int len)
{
    /* bytes are read as unsigned so that all the arithmetic stays unsigned */
    const unsigned char *cursor = (const unsigned char *)s;
    unsigned long hash = 0;
    int remaining = len;

    while (remaining > 0)
    {
        unsigned long high;

        hash = (hash << 4) + (unsigned long)(*cursor);
        cursor++;
        remaining--;

        /* fold bits 28..31 back into the low bits, then clear them */
        high = hash & 0xF0000000UL;
        if (high != 0)
        {
            hash ^= (high >> 24);
            hash &= ~high;
        }
    }

    return (int)hash;
}

xhash_new创建了一个预分配内存的pool, 尺寸满足了创建哈希桶数组以及哈希表自身, Prime是哈希桶的个数, 从命名来看作者希望传入的prime是素数, 但这在实现上来说并不是必须的.

/**
 * Create an empty hash table with `prime` buckets.
 *
 * A single pool is created, sized for the table header plus the bucket array,
 * and both are taken from it.  Keep it that way: all of an xhash's memory
 * should be allocated from this pool with pmalloco()/pmallocx(), so that
 * xhash_free() can simply call pool_free().
 *
 * `prime` is later used as the modulus that selects a bucket, so it has to be
 * positive; it is not validated here.
 */
xht xhash_new(int prime)
{
    pool_t heap = pool_heap(sizeof(_xhn)*prime + sizeof(_xht));
    xht table = pmalloco(heap, sizeof(_xht));

    table->p = heap;
    table->prime = prime;

    /* the buckets are an array of `prime` inline nodes, not of pointers */
    table->zen = pmalloco(heap, sizeof(_xhn)*prime);
    table->free_list = NULL;

    /* no iteration in progress */
    table->iter_node = NULL;
    table->iter_bucket = -1;

#ifdef XHASH_DEBUG
    table->stat = pmalloco(heap, sizeof(int)*prime );
#else
    table->stat = NULL;
#endif

    return table;
}

释放xhash则直接释放pool即可, 这一点不必多说...

/**
 * Destroy a table by freeing its pool; a NULL table is ignored.
 *
 * Nothing else is released here, which is why all xhash memory has to come
 * from the table's own pool (see the note in xhash_new()).
 */
void xhash_free(xht h)
{
    if(h == NULL)
        return;

    pool_free(h->p);
}

分配node采用的如下方法: 在实现上可能有一点迷惑性, 需要注意到哈希桶是实实在在分配了内存的数组, 而不是指针数组, 所以创建一个新的node时, 会先检查哈希桶那个Node是否被使用了, 如果没有使用则直接返回给用户使用. 否则, 需要另外获取一个新的Node, 此时优先检查free_list, 没有free_list则pmalloc重新分配一个node, 之后将该node插入到哈希桶的第一个结点之后(第一个结点是静态分配的). 

另外, 哈希桶链表是双向的.

/*
 * Obtain a node for a new entry in the bucket selected by `index`
 * (a hash code, reduced modulo h->prime) and account for it in h->count.
 *
 * The bucket's inline head node is handed out when it is unused
 * (key == NULL).  Otherwise a node is taken from the free list, or freshly
 * allocated from the pool, and linked in directly behind the head node.
 * The caller fills in key, keylen and val.
 */
static xhn _xhash_node_new(xht h, int index)
{
    int slot = index % h->prime;
    xhn head = &h->zen[slot];
    xhn fresh;

    /* track total */
    h->count++;

#ifdef XHASH_DEBUG
    h->stat[slot]++;
#endif

    /* an empty head node is reused as it is */
    if( head->key == NULL )
        return head;

    /* prefer a recycled node over a new allocation */
    if( h->free_list != NULL )
    {
        fresh = h->free_list;
        h->free_list = fresh->next;
    }
    else
    {
        fresh = pmalloco(h->p, sizeof(_xhn));
    }

    /* splice it in right behind the head node */
    fresh->prev = head;
    fresh->next = head->next;
    if( fresh->next != NULL )
        fresh->next->prev = fresh;
    head->next = fresh;

    return fresh;
}

下面的函数给定哈希值index, 将会定位到特定的哈希桶里顺序查找给定的key, 特别注意到, n->key != NULL 的判断, 一方面哈希桶的第一个node用key = NULL来表示未被使用, 另一方面, 当删除一个正在被迭代器指向的node时, 为了不影响接下来的迭代, 也会令key=NULL来表示删除.

/*
 * Look up `key` (`len` bytes) in the bucket selected by `index`.
 *
 * Returns the matching node, or NULL when there is none.  Nodes whose key is
 * NULL (an unused head node, or a zapped node that is still linked) are
 * skipped.  Keys match when the lengths are equal and strncmp() over `len`
 * bytes reports equality.
 */
static xhn _xhash_node_get(xht h, const char *key, int len, int index)
{
    xhn n = &h->zen[index % h->prime];

    while(n != NULL)
    {
        if(n->key != NULL && n->keylen == len && strncmp(key, n->key, len) == 0)
            break;

        n = n->next;
    }

    /* either the match, or NULL after running off the end of the chain */
    return n;
}

 插入一个元素到哈希表, 采用如下接口: 先_xhasher计算出key的哈希值index, 之后_xhash_node_get查找该key是否已经存在,如果已存在则直接替换其中的内容即可返回.

如果不存在, 则分配一个node(从free_list 或者 pool 中), 赋值其中的内容即可. 

两个接口的区别就是: 后者调用前者, 前者支持指定key的长度, 但实际上, 我发现这个哈希表只能支持字符串key, 因为_xhash_node_get里竟然用的是strncmp, 并且xhash_put里也是strlen计算的key长度.

/**
 * Store `val` under `key` (`len` bytes), replacing the entry if the key is
 * already present.
 *
 * Only the key pointer is stored, no copy is made, so the key memory has to
 * stay valid for as long as the entry is in the table.
 * Does nothing when `h` or `key` is NULL.
 */
void xhash_putx(xht h, const char *key, int len, void *val)
{
    int index;
    xhn slot;

    if(h == NULL || key == NULL)
        return;

    index = _xhasher(key,len);

    /* dirty the xht */
    h->dirty++;

    /* reuse the node of an existing key, otherwise get a new one */
    slot = _xhash_node_get(h, key, len, index);
    if(slot == NULL)
        slot = _xhash_node_new(h, index);

    slot->key = key;
    slot->keylen = len;
    slot->val = val;
}

/**
 * Store `val` under the NUL-terminated string `key`.
 * Convenience wrapper around xhash_putx() using strlen(key) as the length;
 * does nothing when `h` or `key` is NULL.
 */
void xhash_put(xht h, const char *key, void *val)
{
    if(h != NULL && key != NULL)
        xhash_putx(h, key, strlen(key), val);
}

查询更加简单, 内部调用了上面的_xhash_node_get, 并做了一些参数校验.

/**
 * Look up the value stored under `key` (`len` bytes).
 *
 * Returns the stored value pointer, or NULL when `h` or `key` is NULL,
 * `len` is not positive, or the key is not present.
 */
void *xhash_getx(xht h, const char *key, int len)
{
    xhn found;

    if(h == NULL || key == NULL || len <= 0)
        return NULL;

    found = _xhash_node_get(h, key, len, _xhasher(key,len));

    return (found != NULL) ? found->val : NULL;
}

/**
 * Look up the value stored under the NUL-terminated string `key`.
 * Convenience wrapper around xhash_getx() using strlen(key) as the length;
 * returns NULL when `h` or `key` is NULL.
 */
void *xhash_get(xht h, const char *key)
{
    if(h != NULL && key != NULL)
        return xhash_getx(h, key, strlen(key));

    return NULL;
}

删除一个指定的key: 后者调用前者, 主要是操纵双向链表, 并且需要照顾到迭代器是否指向了要删除的node.

如果要删除的node不是哈希桶的那个静态结点(不需要删除, key=NULL就可以表示删除了), 并且也不是当前迭代到的结点, 那么就移除并插到free_list头部. 

对于哈希桶第一个静态Node与被迭代器指向的Node, 作者简单的令key=NULL表示删除, 仅此而已.

/*
 * Remove node `n` from the table; `index` is the hash code of its key.
 *
 * A chained node is unlinked and pushed onto the free list straight away,
 * unless it is the node the iterator currently points at.  A bucket head
 * node, or the iterator's current node, stays linked and is only marked
 * empty by clearing key and val.
 */
void xhash_zap_inner( xht h, xhn n, int index)
{
    int slot = index % h->prime;
    int is_head = (n == &h->zen[slot]);
    int is_current = (n == h->iter_node);

    if( !is_head && !is_current )
    {
        xhn before = n->prev;
        xhn after = n->next;

        /* take the node out of the bucket chain */
        if(before != NULL) before->next = after;
        if(after != NULL) after->prev = before;

        /* and push it onto the head of the free list */
        n->prev = NULL;
        n->next = h->free_list;
        h->free_list = n;
    }

    /* mark the node as empty */
    n->key = NULL;
    n->val = NULL;

    /* dirty the xht and track the total */
    h->dirty++;
    h->count--;

#ifdef XHASH_DEBUG
    h->stat[slot]--;
#endif
}

/**
 * Remove the entry stored under `key` (`len` bytes), if there is one.
 * Does nothing when `h` or `key` is NULL or the key is not present.
 */
void xhash_zapx(xht h, const char *key, int len)
{
    int index;
    xhn victim;

    if( h == NULL || key == NULL )
        return;

    index = _xhasher(key,len);
    victim = _xhash_node_get(h, key, len, index);

    if( victim != NULL )
        xhash_zap_inner(h, victim, index);
}

下面是一些比较杂的函数, 其中xhash_dirty返回的dirty值是在每次插入与删除node时+1的, 在这里还看不出它的具体用途.

/**
 * Return the dirty counter (the number of puts and zaps since the previous
 * call) and reset it to 0.  A NULL table is reported as dirty (1).
 */
int xhash_dirty(xht h)
{
    int changes;

    if(h == NULL)
        return 1;

    changes = h->dirty;
    h->dirty = 0;

    return changes;
}

/** Return the total number of entries in this xht; 0 for a NULL table. */
int xhash_count(xht h)
{
    return (h == NULL) ? 0 : h->count;
}

/**
 * Return the pool this table allocates from.
 * Unlike the other accessors, `h` is not checked for NULL here.
 */
pool_t xhash_pool(xht h)
{
    pool_t owner = h->p;

    return owner;
}

xhash提供了一个遍历哈希表的接口, 允许用户指定回调函数与自定义数据, 原理很简单:

/**
 * Call `w` once for every entry whose key and value are both non-NULL,
 * bucket by bucket, passing `arg` through unchanged.
 * Does nothing when `h` or `w` is NULL.
 */
void xhash_walk(xht h, xhash_walker w, void *arg)
{
    int bucket;

    if(h == NULL || w == NULL)
        return;

    for(bucket = 0; bucket < h->prime; bucket++)
    {
        xhn node = &h->zen[bucket];

        while(node != NULL)
        {
            /* unused head nodes and zapped nodes are skipped */
            if(node->key != NULL && node->val != NULL)
                w(node->key, node->keylen, node->val, arg);

            node = node->next;
        }
    }
}

剩下的是迭代器: 迭代器一方面提供了传统的迭代访问元素的方式, 另一方面其内部也在迭代的过程中回收了那些Key=NULL 或者val=NULL的正常node, 这些node是因为上一次迭代过程中zap删除迭代器指向的node引起的, 在这次迭代过程中将被回收到free_list中.

初始化迭代器: 令iter_bucket和iter_node为初始化状态, 前者表示当前迭代哪个桶, 后者表示迭代哪个结点. 最后会调用xhash_iter_next将迭代器挪到第一个Node.

/**
 * Start an iteration: reset the iterator state and advance to the first
 * entry.  Returns what xhash_iter_next() returns, or 0 for a NULL table.
 */
int xhash_iter_first(xht h) {
    if(h == NULL)
        return 0;

    /* "before the first bucket": xhash_iter_next() starts scanning at bucket 0 */
    h->iter_node = NULL;
    h->iter_bucket = -1;

    return xhash_iter_next(h);
}

令迭代器前进: 先让迭代node指向下一个node, 如果node为空, 那么说明当前的桶内没有node了, 必须迭代下一个哈希桶, 并在新的桶内找到一个key!=NULL&&val!=NULL的Node. 如果当前桶内还有剩余node, 那么令迭代node(iter_node)指向下一个node, 这里有一个while循环, 原因是迭代到的node可能是之前被半删除的node, 循环会将它们回收到free_list中, 直到遇到一个key!=NULL&&val!=NULL的Node则返回.

注:此处可以看出为什么初始化迭代器设置iter_node =NULL, iter_bucket= -1的原因.

/**
 * Advance the iterator to the next entry whose key and val are both non-NULL.
 *
 * Returns 1 when the iterator now points at such an entry.  Returns 0 when
 * `h` is NULL, or when no entry is left; in the latter case the iterator
 * state is reset to iter_bucket = -1 / iter_node = NULL.
 *
 * While finishing the current bucket, chained nodes found empty are unlinked
 * and pushed onto the free list.  Empty nodes met while scanning the following
 * buckets are only skipped.
 *
 * NOTE(review): "empty" here means key == NULL or val == NULL, so an entry
 * that was stored with a NULL value is recycled as well, without h->count
 * being adjusted - confirm callers never store NULL values.
 */
int xhash_iter_next(xht h) {
    if(h == NULL) return 0;

    /* next in this bucket */
    /* step off the current node first, so the node the iterator was on is never examined here */
    h->iter_node = h->iter_node ? h->iter_node->next : NULL;
    while(h->iter_node != NULL) {
        xhn n = h->iter_node;

        if(n->key != NULL && n->val != NULL)
            return 1;

        /* n is empty: move the iterator on before n->next gets overwritten below */
        h->iter_node = n->next;

        /* the bucket's head node lives inside the zen array and is never unlinked */
        if (n != &h->zen[h->iter_bucket]) {
            if(n->prev) n->prev->next = n->next;
            if(n->next) n->next->prev = n->prev;

            /* add it to the free_list head. */
            n->prev = NULL;
            n->next = h->free_list;
            h->free_list = n;
        }
    }

    /* next bucket */
    /* with iter_bucket == -1 (fresh iterator) this scan starts at bucket 0 */
    for(h->iter_bucket++; h->iter_bucket < h->prime; h->iter_bucket++) {
        h->iter_node = &h->zen[h->iter_bucket];

        while(h->iter_node != NULL) {
            if(h->iter_node->key != NULL && h->iter_node->val != NULL)
                return 1;

            h->iter_node = h->iter_node->next;
        }
    }

    /* there is no next */
    h->iter_bucket = -1;
    h->iter_node = NULL;

    return 0;
}

剩下的是删除迭代器指向的Node: 此处不会真正把该Node从链表中摘除, 只会令key=NULL, 等到之后的迭代过程中再被发现并回收到free_list. 之所以不立即摘除, 是因为摘除会改写该Node的next指针, 从而影响接下来的迭代(xhash_iter_next要依赖它走到下一个Node). 作者并不是不能把迭代器的删除实现得更直接, 而是遵循了与C++ map迭代器类似的原则: 将正确操作迭代器的责任交给使用者.

/**
 * Remove the entry the iterator currently points at.
 *
 * The node stays linked in its bucket so that xhash_iter_next() can still
 * step past it; xhash_zap_inner() only marks it empty.
 *
 * Does nothing when `h` is NULL, when the iterator points at no node, or when
 * the current node has already been zapped (its key is NULL).  Without the
 * last check, zapping the same position twice would hash a NULL key and
 * decrement the entry count a second time.
 */
void xhash_iter_zap(xht h)
{
    xhn current;
    int index;

    if( !h || !h->iter_node ) return;

    current = h->iter_node;

    /* already removed: nothing to hash, nothing to count down */
    if( current->key == NULL ) return;

    index = _xhasher( current->key, current->keylen );

    xhash_zap_inner( h, current, index );
}

最后一个接口, 允许用户获取当前迭代器指向node的key和val, 传入的都是指针的地址: 这里严格判断, 一个空的容器的迭代器永远iter_node = NULL, 所以要判断仔细.

下面就是把用户需要的内容返回给用户.

/**
 * Fetch the key and/or value of the entry the iterator points at.
 *
 * `key` and `val` are each optional (NULL to skip), but at least one must be
 * given, and `keylen` is required whenever `key` is.  Returns 0 when these
 * argument rules are violated or `h` is NULL.  Also returns 0 when the
 * iterator points at no node, in which case the requested outputs other than
 * `*keylen` are set to NULL.  Returns 1 after filling the requested outputs.
 */
int xhash_iter_get(xht h, const char **key, int *keylen, void **val) {
    xhn current;

    if(h == NULL) return 0;
    if(key == NULL && val == NULL) return 0;
    if(key != NULL && keylen == NULL) return 0;

    current = h->iter_node;

    if(current == NULL) {
        if(key != NULL) *key = NULL;
        if(val != NULL) *val = NULL;
        return 0;
    }

    if(key != NULL) {
        *key = current->key;
        *keylen = current->keylen;
    }
    if(val != NULL)
        *val = current->val;

    return 1;
}

 

在xhash的插入操作中, 并没有看到预想的pstrdup(key)的操作, 作者将key的副本生成的责任交给了用户自己, 而xhash内部的pool只负责分配xht, xhn的内存.

posted @ 2012-11-20 14:21  xmpp?  阅读(723)  评论(0编辑  收藏  举报