【Jabberd2源码剖析系列 nad】

nad是jabberd2自创的xml存储结构, 它基于expat xml库实现, 提供了更加快捷方便操作xml文档的方法.

nad的意思是not a dom, 意思就是使用自定义结构存储xml dom, 比dom更快更方便.

 

先读一下nad.h中的说明:

/** @file util/nad.h
  * @brief Not A DOM
  * @author Jeremie Miller
  * @author Robert Norris
  * $Date: 2004/05/05 23:49:38 $
  * $Revision: 1.3 $
  * 
  * NAD is very simplistic, and requires all string handling to use a length.
  * Apps using this must be aware of the structure and access it directly for
  * most information. NADs can only be built by successively using the _append_
  * functions correctly. After built, they can be modified using other
  * functions, or by direct access. To access cdata on an elem or attr, use
  * nad->cdata + nad->xxx[index].ixxx for the start, and .lxxx for len.
  *
  * Namespace support seems to work, but hasn't been thoroughly tested. in
  * particular, editing the NAD after its creation might have quirks. use at
  * your own risk! Note that nad_add_namespace() brings a namespace into scope
  * for the next element added with nad_append_elem(), nad_insert_elem() or
  * nad_wrap_elem() (and by extension, any of its subelements). This is the
  * same way that Expat does things, so nad_add_namespace() can be driven from
  * Expat's StartNamespaceDeclHandler. See nad_parse() for an example of how to
  * use Expat to drive NAD.
  */

 

先浏览一下数据结构, 之后根据接口实现来了解每个成员变量分别是什么用途.

/*
 * One element stored in a NAD.
 *
 * Fields come in i/l pairs: the "i" field is the start offset of the text
 * inside nad->cdata and the matching "l" field is its length in bytes.
 */
struct nad_elem_st {
    int parent;  /* NOTE(review): presumably the index of the parent element in nad->elems - confirm */
    int iname;   /* element name: start offset in nad->cdata */
    int lname;   /* element name: length */
    int icdata;  /* cdata within this elem (up to first child): start offset */
    int lcdata;  /* cdata within this elem (up to first child): length */
    int itail;   /* cdata after this elem: start offset */
    int ltail;   /* cdata after this elem: length */
    int attr;    /* NOTE(review): presumably the index of the first attribute in nad->attrs - confirm */
    int ns;      /* NOTE(review): presumably the namespace list in scope for this elem (index into nad->nss) - confirm */
    int my_ns;   /* NOTE(review): presumably the namespace this elem itself belongs to (index into nad->nss) - confirm */
    int depth;   /* NOTE(review): presumably the nesting depth given when the elem was appended - confirm */
};

/*
 * One attribute stored in a NAD.
 *
 * As with elements, an "i" field is a start offset inside nad->cdata and
 * the matching "l" field is the length in bytes.
 */
struct nad_attr_st {
    int iname;  /* attribute name: start offset in nad->cdata */
    int lname;  /* attribute name: length */
    int ival;   /* attribute value: start offset in nad->cdata */
    int lval;   /* attribute value: length */
    int my_ns;  /* NOTE(review): presumably the attribute's namespace (index into nad->nss) - confirm */
    int next;   /* NOTE(review): presumably the index of the next attribute on the same element - confirm */
};

/*
 * One namespace declaration stored in a NAD.
 *
 * "i" fields are start offsets inside nad->cdata, "l" fields are lengths.
 */
struct nad_ns_st {
    int iuri;     /* namespace URI: start offset in nad->cdata */
    int luri;     /* namespace URI: length */
    int iprefix;  /* namespace prefix: start offset in nad->cdata */
    int lprefix;  /* namespace prefix: length */
    int next;     /* NOTE(review): presumably the index of the next namespace in the same scope chain - confirm */
};

/*
 * The NAD itself: growable arrays of elements, attributes and namespaces
 * plus one shared character buffer that all of them index into.
 */
typedef struct nad_st
{
    /* storage buffers */
    struct nad_elem_st *elems;
    struct nad_attr_st *attrs;
    struct nad_ns_st *nss;
    char *cdata;
    int *depths;          /* for tracking the last elem at a depth */

    /* The size in bytes of the elems, attrs, nss, cdata and depths buffers, respectively. */
    int elen;
    int alen;
    int nlen;
    int clen;
    int dlen;

    /* The number of elements of each type of that data that are actually stored in the elems, attrs, nss and cdata buffers, respectively. */
    int ecur;
    int acur;
    int ncur;
    int ccur;

    int scope;            /* currently scoped namespaces, get attached to the next element */
    struct nad_st *next;  /* for keeping a list of nads */
} *nad_t;

nad.c中首先是两个开胃菜函数: _nad_cdata是关于nad_t->cdata成员的追加数组操作, 可以从代码中与结构体声明中的注释猜测到, ccur是cdata被使用的长度, clen是cdata总长度, 并且_nad_realloc, NAD_SAFE这种接口在util之前的代码里也遇到过, 所以并不难理解. 

注意到, _nad_cdata函数返回的是新添加的cdata在nad->cdata的起始位置偏移量.

/**
 * internal: (re)allocate *oblocks so it can hold at least len bytes.
 *
 * The requested size is rounded up to a whole number of BLOCKSIZE blocks.
 *
 * @param oblocks  address of the buffer pointer; updated to the new block
 * @param len      minimum number of bytes required
 * @return the number of bytes actually allocated
 *
 * Callers store the return value as the buffer's new capacity and have no
 * channel for reporting failure, so running out of memory aborts the
 * process here instead of leaving a NULL buffer with a non-zero capacity.
 */
static int _nad_realloc(void **oblocks, int len)
{
    void *nblocks;
    int nlen;

    /* round up to standard block sizes */
    nlen = (((len-1)/BLOCKSIZE)+1)*BLOCKSIZE;

    /* go through a temporary so the old block is not lost if realloc fails */
    nblocks = realloc(*oblocks, nlen);
    if(nblocks == NULL)
        abort();

    *oblocks = nblocks;
    return nlen;
}

/**
 * this is the safety check used to make sure there's always enough mem
 *
 * If `size` bytes exceed the current capacity `len`, `blocks` is regrown
 * through _nad_realloc() and `len` is updated to the new capacity.
 * `blocks` and `len` must be lvalues; `size` may be evaluated twice, so it
 * must not have side effects. Wrapped in do/while(0) so the macro behaves
 * as a single statement (safe inside an unbraced if/else).
 */
#define NAD_SAFE(blocks, size, len) \
    do { \
        if((size) > (len)) \
            (len) = _nad_realloc((void**)&(blocks),(size)); \
    } while(0)

/**
 * internal: copy len bytes from cdata onto the end of nad->cdata.
 *
 * The buffer is grown first if needed (NAD_SAFE).
 *
 * @return the offset inside nad->cdata at which the new bytes start
 */
static int _nad_cdata(nad_t nad, const char *cdata, int len)
{
    /* the new data begins where the used part of the buffer currently ends */
    int offset = nad->ccur;

    NAD_SAFE(nad->cdata, offset + len, nad->clen);

    memcpy(nad->cdata + offset, cdata, len);
    nad->ccur = offset + len;

    return offset;
}

接下来看一下其他无关紧要的函数, 作为热身: 代码中的debug代码段不需要关注, 因为要揣测debug意图在当前情况下比较耗精力.

nad_new分配nad_st结构体, 令nad->scope为-1, 该变量记录的是下一个插入结点的命名空间, 在nad.h头部的注释有一些概括说明帮助理解.

nad_copy顾名思义, 复制一个nad结构体, 其中的_nad_ptr_check不需要关注, 因为在非debug模式下它是空宏. 该函数首先nad_new创建一个新nad(使用了calloc), 之后就是一系列NAD_SAFE为新的nad分配内存, 因为各成员的len都是0, 并且指针都是NULL, 所以NAD_SAFE相当于malloc分配内存.最后将旧nad的各个成员拷贝给新nad的各个成员, 最后完成了copy.

nad_free仅仅free掉各个成员数组, 并释放nad结构体自身.(不要被debug宏影响)

/**
 * Allocate a new, empty nad.
 *
 * The struct is zero-filled (all buffers NULL, all lengths and counts 0)
 * and scope is set to -1.
 *
 * @return the new nad, or NULL if memory could not be allocated
 */
nad_t nad_new(void)
{
    nad_t nad;

    nad = calloc(1, sizeof(struct nad_st));
    if(nad == NULL)
        return NULL;

    nad->scope = -1;

#ifdef NAD_DEBUG
    {
    char loc[24];
    snprintf(loc, sizeof(loc), "%x", (int) nad);
    xhash_put(_nad_alloc_tracked, pstrdup(xhash_pool(_nad_alloc_tracked), loc), (void *) 1);
    }
    _nad_ptr_check(__func__, nad);
#endif

    return nad;
}

/**
 * Create an independent copy of a nad.
 *
 * The elems, attrs, nss and cdata buffers are duplicated at their full
 * allocated size, and the usage counters and scope are carried over. The
 * depths buffer is not copied; the copy starts with none.
 *
 * @param nad  the nad to copy (may be NULL)
 * @return the copy, or NULL if nad is NULL or memory could not be allocated
 */
nad_t nad_copy(nad_t nad)
{
    nad_t copy;

    _nad_ptr_check(__func__, nad);

    if(nad == NULL) return NULL;

    copy = nad_new();
    if(copy == NULL) return NULL;

    /* if it's not large enough, make bigger */
    NAD_SAFE(copy->elems, nad->elen, copy->elen);
    NAD_SAFE(copy->attrs, nad->alen, copy->alen);
    NAD_SAFE(copy->nss, nad->nlen, copy->nlen);
    NAD_SAFE(copy->cdata, nad->clen, copy->clen);

    /* give up cleanly if any buffer that has to receive data is missing */
    if((nad->elen > 0 && copy->elems == NULL) ||
       (nad->alen > 0 && copy->attrs == NULL) ||
       (nad->nlen > 0 && copy->nss == NULL) ||
       (nad->clen > 0 && copy->cdata == NULL)) {
        nad_free(copy);
        return NULL;
    }

    /* copy all data; skip empty buffers, since their pointers may be NULL
     * and passing NULL to memcpy is undefined even for a length of 0 */
    if(nad->elen > 0) memcpy(copy->elems, nad->elems, nad->elen);
    if(nad->alen > 0) memcpy(copy->attrs, nad->attrs, nad->alen);
    if(nad->nlen > 0) memcpy(copy->nss, nad->nss, nad->nlen);
    if(nad->clen > 0) memcpy(copy->cdata, nad->cdata, nad->clen);

    /* sync data */
    copy->ecur = nad->ecur;
    copy->acur = nad->acur;
    copy->ncur = nad->ncur;
    copy->ccur = nad->ccur;

    copy->scope = nad->scope;

    return copy;
}

/**
 * Release a nad and every buffer it owns. Passing NULL is a no-op.
 *
 * When NAD_DEBUG is defined the nad is moved from the "allocated" tracking
 * table to the "freed" one and the struct itself is deliberately not freed
 * (NOTE(review): presumably so later use of a freed nad can be detected -
 * confirm against _nad_ptr_check).
 */
void nad_free(nad_t nad)
{
#ifdef NAD_DEBUG
    char key[24];
#endif

    if(nad == NULL)
        return;

#ifdef NAD_DEBUG
    _nad_ptr_check(__func__, nad);

    /* the tracking tables are keyed by the pointer value printed in hex */
    snprintf(key, sizeof(key), "%x", (int) nad);
    xhash_zap(_nad_alloc_tracked, key);
    xhash_put(_nad_free_tracked, pstrdup(xhash_pool(_nad_free_tracked), key), (void *) nad);
#endif

    /* release the storage buffers */
    free(nad->elems);
    free(nad->attrs);
    free(nad->nss);
    free(nad->cdata);
    free(nad->depths);

#ifndef NAD_DEBUG
    /* and finally the nad itself */
    free(nad);
#endif
}

 

要从0搞懂nad, 从nad_parse函数入手最好, 因为它使用expat解析xml文档, 并在expat回调过程中构造完成nad_t, 所以从中可以了解到nad是怎么操作的, 然后才可以针对操作看实现.

围绕nad_parse接口, 有一系列提供给expat的回调, 以及一个专用于nad_parse的结构体. build_data中nad是随着expat解析构造的nad结构体, depth是当前解析的标签深度, p是expat的解析器XML_Parser.

nad_parse中使用了一系列expat的接口, 必须清楚的知道其用途, 方便理解nad工作方式.

首先XML_ParserCreateNS创建一个expat xml解析器, 第一个参数是xml文档的编码, 传NULL则使用默认方式(根据xml声明<?xml version="1.0" encoding="utf-8"?>确定, 没有声明时默认使用utf-8), 第二个参数的用途比较重要, 它是命名空间分隔符: NS的意思是namespace, 当expat发现了一个处于命名空间的标签的时候, 不仅仅只返回标签名, 而是会用这个分隔符(这里是'|')拼接, 左边放置uri, 右边放置tag name(标签名), 即uri|tagname.(即匿名命名空间)

在此之后, 还将该parser赋值到build_data->p中, 最后调用XML_SetReturnNSTriplet, 该函数的作用还是针对处于命名空间中的标签的, 第二个参数非0会导致解析到标签时返回的格式进一步变化为: uri|tagname|prefix, 其中prefix是命名空间的缩写前缀.(即有名命名空间)

接下来, 有一个宏判断, 此处两行代码作用一致, 是为了防止XML的实体攻击(即代码注释中提到的"billion laughs"攻击)导致内存耗尽, XML中允许存在一种特殊声明: <!ENTITY entity-name "entity-content">, expat parser默认会在解析过程中为之分配内存并存储, 如果我们不禁止这个行为, 将会被攻击者利用导致耗尽服务器内存, 所以有了这两行代码.

如果expat版本支持XML_StopParser, 那么通过XML_SetEntityDeclHandler(p, (void *) _nad_parse_entity_declaration);设置实体声明(Entity)的处理函数, 该函数内部调用XML_StopParser终止expat解析; 如果不支持此接口, 那么调用XML_SetDefaultHandler并把第二个参数设置为NULL, 按照代码注释的说法, 这种方式只是阻止此类实体被展开(expansion), 并不会终止解析.

#ifdef HAVE_XML_STOPPARSER
/*
 * Entity declaration callback for expat: any entity declaration makes us
 * stop the parser (non-resumably). Only arg is used; it is the
 * struct build_data registered as user data, which carries the parser.
 */
static void _nad_parse_entity_declaration(void *arg, const char *entityName,
                                          int is_parameter_entity, const char *value,
                                          int value_length, const char *base,
                                          const char *systemId, const char *publicId,
                                          const char *notationName)
{
    XML_StopParser(((struct build_data *) arg)->p, XML_FALSE);
}
#endif

最后, XML_SetUserData设置回调用户数据即build_data, XML_SetElementHandler设置标签开始与结束的回调, XML_SetCharacterDataHandler设置处理普通文本数据的方法, XML_SetStartNamespaceDeclHandler设置遇见命名空间声明的回调. 之后调用XML_Parse执行解析, 第四个参数表示这是否是最后一次feed数据, 这里总是设置为1. 解析结果有两种失败情况: 一是XML_Parse直接返回失败; 二是XML_Parse返回成功(expat对待不完整的标签也认为成功, 因为它是feed工作模式), 但build_data->depth不等于0, 这说明我们的xml文档不完整, 解析没有完整结束, 同样认为失败了.

/** parse a buffer into a nad */

/* State shared between nad_parse() and its expat callbacks (passed as expat user data). */
struct build_data {
    nad_t nad;     /* the nad being filled in by the callbacks */
    int depth;     /* starts at 0 and is incremented for every element start */
    XML_Parser p;  /* the parser itself, so a callback can stop it */
};

/**
 * Parse an XML buffer into a newly built nad.
 *
 * @param buf  the XML text; must be NUL-terminated when len is 0
 * @param len  number of bytes in buf, or 0 to use strlen(buf)
 * @return the resulting nad (to be released with nad_free()), or NULL if
 *         buf is NULL, the parser or the nad could not be created, expat
 *         reported an error, or the element depth was not back at 0 once
 *         parsing finished
 */
nad_t nad_parse(const char *buf, int len) {
    struct build_data bd;
    XML_Parser p;
    int parsed;

    if(buf == NULL)
        return NULL;

    if(len == 0)
        len = strlen(buf);

    p = XML_ParserCreateNS(NULL, '|');
    if(p == NULL)
        return NULL;
    bd.p = p;

    XML_SetReturnNSTriplet(p, 1);
    /* Prevent the "billion laughs" attack against expat by disabling
     * internal entity expansion.  With 2.x, forcibly stop the parser
     * if an entity is declared - this is safer and a more obvious
     * failure mode.  With older versions, simply prevent expansion
     * of such entities. */
#ifdef HAVE_XML_STOPPARSER
    XML_SetEntityDeclHandler(p, (void *) _nad_parse_entity_declaration);
#else
    XML_SetDefaultHandler(p, NULL);
#endif

    bd.nad = nad_new();
    if(bd.nad == NULL) {
        XML_ParserFree(p);
        return NULL;
    }
    bd.depth = 0;

    XML_SetUserData(p, (void *) &bd);
    XML_SetElementHandler(p, _nad_parse_element_start, _nad_parse_element_end);
    XML_SetCharacterDataHandler(p, _nad_parse_cdata);
    XML_SetStartNamespaceDeclHandler(p, _nad_parse_namespace_start);

    /* the whole document is fed in one go, hence isFinal = 1 */
    parsed = XML_Parse(p, buf, len, 1) != 0;

    XML_ParserFree(p);

    /* a parse error and an unbalanced document are both failures; either
     * way the half-built nad must be released rather than leaked */
    if(!parsed || bd.depth != 0) {
        nad_free(bd.nad);
        return NULL;
    }

    return bd.nad;
}

先看_nad_parse_element_start, 在进入一个标签时被回调: 代码注释中描述的比较清晰, 标签可能是匿名命名空间的, 也可能是有名命名空间的(带prefix)的, 也可能不在命名空间内, 所以首先解析获得uri, elem, prefix. 接着尝试添加这个命名空间nad_add_namespace(bd->nad, uri, prefix), 该函数看起来返回了一个int ns来标示这个命名空间, 接下来调用 el = nad_append_elem(bd->nad, ns, elem, bd->depth); 将这个标签添加到nad中, 它会返回一个int el来标示这个标签, 它会受到nad_add_namespace的影响而处于其命名空间下.

紧接着, 遍历该标签下的所有属性值, 每个属性都可能处于一个命名空间下, 同样试图将解析到的命名空间先添加到该el标示的标签下(其实是为接下来对该elem添加的属性设置所属命名空间), 之后nad_append_attr(bd->nad, ns, elem, (char *) attr[1]);向该元素添加该属性的值(attr[0]是key, attr[1]是val), 该属性将会处于ns标示的命名空间下.

同时, 因为进入了一层标签, depth+1.

/**
 * internal: copy an expat name into buf (truncating to buflen - 1 bytes)
 * and split it in place on '|'.
 *
 * expat gives us:
 *     prefixed namespaced name: uri|name|prefix
 *      default namespaced name: uri|name
 *           un-namespaced name: name
 *
 * On return *elem points at the local name. For a namespaced name *uri
 * points at the URI and *prefix at the prefix (NULL if there is none); for
 * an un-namespaced name both are NULL. All pointers refer into buf.
 *
 * @return 1 if the name carried a namespace, 0 otherwise
 */
static int _nad_parse_split_name(const char *name, char *buf, size_t buflen,
                                 char **uri, char **elem, char **prefix)
{
    char *sep;

    /* make a copy */
    strncpy(buf, name, buflen);
    buf[buflen - 1] = '\0';

    sep = strchr(buf, '|');
    if(sep == NULL) {
        /* un-namespaced, just take it as-is */
        *uri = NULL;
        *elem = buf;
        *prefix = NULL;
        return 0;
    }

    /* terminate the uri; the local name follows the separator */
    *sep = '\0';
    *uri = buf;
    *elem = sep + 1;

    /* an optional second separator introduces the prefix */
    sep = strchr(*elem, '|');
    if(sep != NULL) {
        *sep = '\0';
        sep++;
    }
    *prefix = sep;

    return 1;
}

/**
 * expat start-element callback: append the element and its attributes to
 * the nad being built, then go one level deeper.
 *
 * @param arg   the struct build_data registered as expat user data
 * @param name  element name as reported by expat
 * @param atts  NULL-terminated array of name/value pairs
 */
static void _nad_parse_element_start(void *arg, const char *name, const char **atts) {
    struct build_data *bd = (struct build_data *) arg;
    char scratch[1024];
    char *uri, *local, *prefix;
    const char **pair;
    int elem_idx, ns_idx;

    /* the element itself; a namespaced one gets its namespace added first */
    ns_idx = -1;
    if(_nad_parse_split_name(name, scratch, sizeof(scratch), &uri, &local, &prefix))
        ns_idx = nad_add_namespace(bd->nad, uri, prefix);

    elem_idx = nad_append_elem(bd->nad, ns_idx, local, bd->depth);

    /* now the attributes, one name/value pair at a time */
    for(pair = atts; pair[0] != NULL; pair += 2) {
        ns_idx = -1;
        if(_nad_parse_split_name(pair[0], scratch, sizeof(scratch), &uri, &local, &prefix))
            ns_idx = nad_append_namespace(bd->nad, elem_idx, uri, prefix);

        nad_append_attr(bd->nad, ns_idx, local, (char *) pair[1]);
    }

    bd->depth++;
}

 

posted @ 2012-11-20 16:06  xmpp?  阅读(562)  评论(0编辑  收藏  举报