redis6.0.5之HyperLogLog阅读笔记4-基数估算实现之核心算法和命令

/* ========================= HyperLogLog Count ==============================
 * This is the core of the algorithm where the approximated count is computed.
 * The function uses the lower level hllDenseRegHisto() and hllSparseRegHisto()
 * functions as helpers to compute histogram of register values part of the
 * computation, which is representation-specific, while all the rest is common. */
这里是近似基数估算的核心算法。该函数借助底层函数 hllDenseRegHisto() 和 hllSparseRegHisto()
来计算寄存器值的直方图,这一部分依赖于具体的表示方式(密集/稀疏),其余部分则是通用的。
**************************************************************************************************************
/* Implements the register histogram calculation for uint8_t data type
 * which is only used internally as speedup for PFCOUNT with multiple keys. */
对uint8_t类型的数据实现寄存器直方图的计算,该函数只在内部为了加速多个键时PFCOUNT使用
/* Add to 'reghisto' the histogram of the HLL_REGISTERS one-byte register
 * values stored at 'registers': reghisto[v] is incremented once for every
 * register holding the value v. The histogram is accumulated, not reset,
 * so the caller provides it already initialized, and it must be large
 * enough to be indexed by every value found in 'registers'.
 *
 * Registers are scanned eight at a time. Each group is loaded into a
 * 64-bit word with memcpy() rather than by casting the byte pointer to
 * uint64_t*, so the function makes no alignment assumption about
 * 'registers' and does not read a byte array through an incompatible
 * pointer type. */
void hllRawRegHisto(uint8_t *registers, int* reghisto) {
    uint8_t *bytes = registers;
    int j;

    for (j = 0; j < HLL_REGISTERS/8; j++) {
        uint64_t word;

        memcpy(&word,bytes,sizeof(word));
        if (word == 0) {
            /* Fast path: all the eight registers of this group are zero. */
            reghisto[0] += 8;
        } else {
            reghisto[bytes[0]]++;
            reghisto[bytes[1]]++;
            reghisto[bytes[2]]++;
            reghisto[bytes[3]]++;
            reghisto[bytes[4]]++;
            reghisto[bytes[5]]++;
            reghisto[bytes[6]]++;
            reghisto[bytes[7]]++;
        }
        bytes += 8;
    }
}
**************************************************************************************************************
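公式为: $\sigma(x) = x + \sum_{k=1}^{\infty} x^{2^k} \cdot 2^{k-1}$
从公式可以看出,当 $0 \le x < 1$ 时,随着 k 变大,新增的项 $x^{2^k} \cdot 2^{k-1}$ 会趋于0(x = 1 时级数发散,函数直接返回 INFINITY)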
/* Helper function sigma as defined in
 * "New cardinality estimation algorithms for HyperLogLog sketches"
 * Otmar Ertl, arXiv:1702.01284 */
/* Evaluate the series x + x^2*1 + x^4*2 + x^8*4 + ... by summing terms
 * until adding one more no longer changes the double precision result.
 * Returns INFINITY for x == 1, which is handled up front without running
 * the loop. */
double hllSigma(double x) {
    if (x == 1.) return INFINITY;

    double sum = x;      /* Running total, seeded with the leading 'x'. */
    double weight = 1;   /* Power of two multiplying the next term. */
    double previous;

    for (;;) {
        x *= x;          /* x now holds the next repeated square. */
        previous = sum;
        sum += x * weight;
        weight += weight;
        if (previous == sum) break; /* The last term was too small to count. */
    }
    return sum;
}
**************************************************************************************************************
公式为: $\tau(x) = \frac{1}{3}\left(1 - x - \sum_{k=1}^{\infty} \left(1 - x^{2^{-k}}\right)^{2} \cdot 2^{-k}\right)$
从公式可以看出,当 $0 < x < 1$ 时,随着 k 变大,$x^{2^{-k}}$ 会趋于1,从而新增项也会趋于0
/* Helper function tau as defined in
 * "New cardinality estimation algorithms for HyperLogLog sketches"
 * Otmar Ertl, arXiv:1702.01284 */
double hllTau(double x) {
    if (x == 0. || x == 1.) return 0.;
    double zPrime;
    double y = 1.0;
    double z = 1 - x;
    do {
        x = sqrt(x);
        zPrime = z;
        y *= 0.5;
        z -= pow(1 - x, 2)*y;
    } while(zPrime != z);
    return z / 3;
}
**************************************************************************************************************
/* Return the approximated cardinality of the set based on the harmonic
 * mean of the registers values. 'hdr' points to the start of the SDS
 * representing the String object holding the HLL representation.
基于寄存器的调和平均值返回集合的近似基数。'hdr'指向保存HLL表达式的字符串对象的sds表示的开始之处。
 * If the sparse representation of the HLL object is not valid, the integer
 * pointed by 'invalid' is set to non-zero, otherwise it is left untouched.
如果HLL对象的稀疏表示无效,那么invalid指向的整数被设置为非零,否则保持不变。
 * hllCount() supports a special internal-only encoding of HLL_RAW, that
 * is, hdr->registers will point to an uint8_t array of HLL_REGISTERS element.
 * This is useful in order to speedup PFCOUNT when called against multiple
 * keys (no need to work with 6-bit integers encoding). */
函数hllCount支持一种特殊的仅限内部使用的编码方式HLL_RAW,即hdr->registers指向一个包含HLL_REGISTERS个元素的uint8_t类型数组。
这有助于加速针对多个键调用的PFCOUNT命令(不需要处理6bit整型编码)
uint64_t hllCount(struct hllhdr *hdr, int *invalid) {
    double m = HLL_REGISTERS;
    double E;
    int j;
    /* Note that reghisto size could be just HLL_Q+2, becuase HLL_Q+1 is
     * the maximum frequency of the "000...1" sequence the hash function is
     * able to return. However it is slow to check for sanity of the
     * input: instead we history array at a safe size: overflows will
     * just write data to wrong, but correctly allocated, places. */
注意到reghisto的大小可能为HLL_Q+2,因为HLL_Q+1是哈希函数能够返回的类似序列"000...1"最大的频率。
然而检查输入的合法性太慢了,作为替代,我们设置history数组为一个完全的值,
超出的数据会写入错误的位置但是已经分配过的地址(这样就不会报错了)
    int reghisto[64] = {0};  初始化为0

    /* Compute register histogram */  计算寄存器值的直方图
    if (hdr->encoding == HLL_DENSE) {  密集表示的情况下
        hllDenseRegHisto(hdr->registers,reghisto);
    } else if (hdr->encoding == HLL_SPARSE) {  稀疏表示的情况下
        hllSparseRegHisto(hdr->registers,
                         sdslen((sds)hdr)-HLL_HDR_SIZE,invalid,reghisto);
    } else if (hdr->encoding == HLL_RAW) {  原始表示
        hllRawRegHisto(hdr->registers,reghisto);
    } else {
        serverPanic("Unknown HyperLogLog encoding in hllCount()");
    }

    /* Estimate cardinality form register histogram. See:
     * "New cardinality estimation algorithms for HyperLogLog sketches"
     * Otmar Ertl, arXiv:1702.01284 */
     从寄存器直方图估算基数
    double z = m * hllTau((m-reghisto[HLL_Q+1])/(double)m); // c(q+1)
    for (j = HLL_Q; j >= 1; --j) {   //从 c(1)->c(q)
        z += reghisto[j];
        z *= 0.5;
    }
    z += m * hllSigma(reghisto[0]/(double)m);  //c(0)
    E = llroundl(HLL_ALPHA_INF*m*m/z);  获取基数的公式

    return (uint64_t) E;  返回估算的基数
}
**************************************************************************************************************
/* Call hllDenseAdd() or hllSparseAdd() according to the HLL encoding. */
基于HLL的编码调用函数hllDenseAdd() 或者 hllSparseAdd(), 添加新元素
/* Add the element 'ele' of 'elesize' bytes to the HLL stored in 'o',
 * dispatching on the encoding found in the object header.
 *
 * The return value is the one of the selected helper, or -1 when the
 * header carries an encoding that is neither dense nor sparse. */
int hllAdd(robj *o, unsigned char *ele, size_t elesize) {
    struct hllhdr *hdr = o->ptr;
    switch(hdr->encoding) {
    /* The dense helper works directly on the registers array. */
    case HLL_DENSE: return hllDenseAdd(hdr->registers,ele,elesize);
    /* The sparse helper receives the whole object instead. */
    case HLL_SPARSE: return hllSparseAdd(o,ele,elesize);
    default: return -1; /* Invalid representation. */
    }
}
**************************************************************************************************************
/* Merge by computing MAX(registers[i],hll[i]) the HyperLogLog 'hll'
 * with an array of uint8_t HLL_REGISTERS registers pointed by 'max'.
通过计算registers[i]和hll[i]大的值,将hll和由max指向的uint8_t类型的寄存器数组合并
 * The hll object must be already validated via isHLLObjectOrReply()
 * or in some other way.
这个hll的对象必须是函数isHLLObjectOrReply验证有效的或者由另外方式验证有效的
 * If the HyperLogLog is sparse and is found to be invalid, C_ERR
 * is returned, otherwise the function always succeeds. */
 如果hll是稀疏表示并且是无效的,那么返回错误,否则函数总是返回成功。
/* Merge the registers of the HyperLogLog object 'hll' into 'max', an array
 * of HLL_REGISTERS uint8_t values, by setting max[i] to the greater of
 * max[i] and the i-th register of 'hll'.
 *
 * Returns C_OK on success. For a non dense object C_ERR is returned when
 * the opcodes do not describe exactly HLL_REGISTERS registers, either
 * because a run would go past the last register or because the payload
 * ends too early. In that case 'max' may have been partially updated.
 *
 * Every opcode is bounds checked before the register index is advanced,
 * including the ZERO and XZERO ones: otherwise a long enough sequence of
 * zero runs could overflow the 'int' index to a negative value, and a
 * following VAL opcode would write outside 'max'. */
int hllMerge(uint8_t *max, robj *hll) {
    struct hllhdr *hdr = hll->ptr;
    int i;

    if (hdr->encoding == HLL_DENSE) {
        uint8_t val;

        for (i = 0; i < HLL_REGISTERS; i++) {
            /* Fetch the i-th register and keep the greater of the two. */
            HLL_DENSE_GET_REGISTER(val,hdr->registers,i);
            if (val > max[i]) max[i] = val;
        }
    } else {
        uint8_t *p = hll->ptr, *end = p + sdslen(hll->ptr);
        long runlen, regval;

        p += HLL_HDR_SIZE;
        i = 0;
        /* Walk the opcodes: zero runs only advance the register index,
         * since a zero can never be greater than what is already in 'max',
         * while VAL runs are compared register by register. */
        while(p < end) {
            if (HLL_SPARSE_IS_ZERO(p)) {
                runlen = HLL_SPARSE_ZERO_LEN(p);
                if ((runlen + i) > HLL_REGISTERS) return C_ERR; /* Overflow. */
                i += runlen;
                p++;
            } else if (HLL_SPARSE_IS_XZERO(p)) {
                runlen = HLL_SPARSE_XZERO_LEN(p);
                if ((runlen + i) > HLL_REGISTERS) return C_ERR; /* Overflow. */
                i += runlen;
                p += 2; /* This opcode is two bytes long. */
            } else {
                runlen = HLL_SPARSE_VAL_LEN(p);
                regval = HLL_SPARSE_VAL_VALUE(p);
                if ((runlen + i) > HLL_REGISTERS) return C_ERR; /* Overflow. */
                while(runlen--) {
                    if (regval > max[i]) max[i] = regval;
                    i++;
                }
                p++;
            }
        }
        /* The opcodes must cover all the registers, no more and no less. */
        if (i != HLL_REGISTERS) return C_ERR;
    }
    return C_OK;
}
**************************************************************************************************************
/* ========================== HyperLogLog commands ========================== */
hll相关命令
/* Create an HLL object. We always create the HLL using sparse encoding.
 * This will be upgraded to the dense representation as needed. */
创建一个HLL对象。我们总是用稀疏编码创建HLL对象。
如果需要,这个(稀疏表示)将会被密集表示取代。
robj *createHLLObject(void) {
    robj *o;
    struct hllhdr *hdr;
    sds s;
    uint8_t *p;
    这个式子可以根据HLL_SPARSE_XZERO_MAX_LEN的值调整所需空间大小
    int sparselen = HLL_HDR_SIZE +
                    (((HLL_REGISTERS+(HLL_SPARSE_XZERO_MAX_LEN-1)) /
                     HLL_SPARSE_XZERO_MAX_LEN)*2);  
                     
    int aux;

    /* Populate the sparse representation with as many XZERO opcodes as
     * needed to represent all the registers. */
    aux = HLL_REGISTERS;
    s = sdsnewlen(NULL,sparselen);
    p = (uint8_t*)s + HLL_HDR_SIZE;//跳过头结构,指向值的开始位置
    while(aux) {
        int xzero = HLL_SPARSE_XZERO_MAX_LEN;
        if (xzero > aux) xzero = aux; 如果xzero能表示的最大值大于寄存器的个数,那么只使用寄存器个数的值
        HLL_SPARSE_XZERO_SET(p,xzero); 设置xzero表示的值
        p += 2; xzero长度为2
        aux -= xzero; 如果xzero的最大长度小于寄存器的个数时候,需要用多个表示,那么这里就有需要,在实际的例子中,这里不需要循环,一个即可比保湿
    }
    serverAssert((p-(uint8_t*)s) == sparselen); 确保值部分的长度没有问题

    /* Create the actual object. */
    o = createObject(OBJ_STRING,s);  创建对象
    hdr = o->ptr;
    memcpy(hdr->magic,"HYLL",4);
    hdr->encoding = HLL_SPARSE;
    return o;  返回对象
}
**************************************************************************************************************
/* Check if the object is a String with a valid HLL representation.
 * Return C_OK if this is true, otherwise reply to the client
 * with an error and return C_ERR. */
检查对象是否是一个有效的HLL字符串表示,是返回C_OK,不是就返回客户端一个错误信息和C_ERR
/* Validate that 'o' is a string object carrying an HLL header.
 *
 * Returns C_OK when all the checks below pass. Otherwise C_ERR is
 * returned after an error was sent to the client 'c': either the one
 * produced by checkType(), or the WRONGTYPE reply emitted at the
 * 'invalid' label. */
int isHLLObjectOrReply(client *c, robj *o) {
    struct hllhdr *hdr;

    /* Key exists, check type */
    if (checkType(c,o,OBJ_STRING))
        return C_ERR; /* Error already sent. */

    /* The value must be SDS encoded and at least as long as the header. */
    if (!sdsEncodedObject(o)) goto invalid;
    if (stringObjectLen(o) < sizeof(*hdr)) goto invalid;
    hdr = o->ptr;

    /* Magic should be "HYLL". */
    if (hdr->magic[0] != 'H' || hdr->magic[1] != 'Y' ||
        hdr->magic[2] != 'L' || hdr->magic[3] != 'L') goto invalid;

    /* Encodings above HLL_MAX_ENCODING are not accepted. */
    if (hdr->encoding > HLL_MAX_ENCODING) goto invalid;

    /* Dense representation string length should match exactly. */
    if (hdr->encoding == HLL_DENSE &&
        stringObjectLen(o) != HLL_DENSE_SIZE) goto invalid;

    /* All tests passed. */
    return C_OK;

invalid:
    /* Tell the client that the value is not a valid HLL. */
    addReplySds(c,
        sdsnew("-WRONGTYPE Key is not a valid "
               "HyperLogLog string value.\r\n"));
    return C_ERR;
}
**************************************************************************************************************
/* PFADD var ele ele ele ... ele => :0 or :1 */
PFADD命令添加元素
/* PFADD implementation: add the elements in c->argv[2..] to the HLL stored
 * at the key c->argv[1], creating the key when it does not exist.
 *
 * The reply is shared.cone when the key was created or at least one
 * hllAdd() call returned 1, shared.czero otherwise. An error is replied
 * instead when the existing value is not a valid HLL or when hllAdd()
 * returns -1. */
void pfaddCommand(client *c) {
    robj *o = lookupKeyWrite(c->db,c->argv[1]);
    struct hllhdr *hdr;
    int updated = 0, j;

    if (o == NULL) {
        /* Create the key with a string value of the exact length to
         * hold our HLL data structure. sdsnewlen() when NULL is passed
         * is guaranteed to return bytes initialized to zero. */
        o = createHLLObject();
        dbAdd(c->db,c->argv[1],o);
        updated++; /* Creating the key counts as a modification. */
    } else {
        if (isHLLObjectOrReply(c,o) != C_OK) return;
        /* Continue with the object returned by the unshare call. */
        o = dbUnshareStringValue(c->db,c->argv[1],o);
    }
    /* Perform the low level ADD operation for every element. */
    for (j = 2; j < c->argc; j++) {
        int retval = hllAdd(o, (unsigned char*)c->argv[j]->ptr,
                               sdslen(c->argv[j]->ptr));
        switch(retval) {
        case 1: /* hllAdd() returned 1: count it as an update. */
            updated++;
            break;
        case -1: /* hllAdd() failed: reply with an error and stop. */
            addReplySds(c,sdsnew(invalid_hll_err));
            return;
        }
    }
    hdr = o->ptr;
    if (updated) {
        /* Signal the modification, emit the "pfadd" keyspace event, and
         * invalidate the cached cardinality stored in the header. */
        signalModifiedKey(c,c->db,c->argv[1]);
        notifyKeyspaceEvent(NOTIFY_STRING,"pfadd",c->argv[1],c->db->id);
        server.dirty++;
        HLL_INVALIDATE_CACHE(hdr);
    }
    addReply(c, updated ? shared.cone : shared.czero);
}
**************************************************************************************************************
/* PFCOUNT var -> approximated cardinality of set. */
PFCOUNT命令  计算集合近似的基数
/* PFCOUNT implementation.
 *
 * With more than one key, the HLLs are merged into a temporary raw
 * encoded HLL living on the stack, and the cardinality of the merge is
 * replied. Missing keys are skipped.
 *
 * With a single key, the reply is zero if the key does not exist.
 * Otherwise the cardinality cached in the header is returned when valid,
 * else it is computed, stored back into the header, and the key is
 * signaled as modified.
 *
 * An error is replied when a value is not a valid HLL or when its sparse
 * representation turns out to be corrupted. */
void pfcountCommand(client *c) {
    robj *o;
    struct hllhdr *hdr;
    uint64_t card;

    /* Case 1: multi-key keys, cardinality of the union.
     *
     * When multiple keys are specified, PFCOUNT actually computes
     * the cardinality of the merge of the N HLLs specified. */
    if (c->argc > 2) {
        uint8_t max[HLL_HDR_SIZE+HLL_REGISTERS], *registers;
        int j;

        /* Compute an HLL with M[i] = MAX(M[i]_j). */
        memset(max,0,sizeof(max));
        hdr = (struct hllhdr*) max;
        hdr->encoding = HLL_RAW; /* Special internal-only encoding. */
        registers = max + HLL_HDR_SIZE; /* Registers follow the header. */
        for (j = 1; j < c->argc; j++) {
            /* Check type and size. */
            o = lookupKeyRead(c->db,c->argv[j]);
            if (o == NULL) continue; /* Assume empty HLL for non existing var.*/
            if (isHLLObjectOrReply(c,o) != C_OK) return;

            /* Merge with this HLL with our 'max' HLL by setting max[i]
             * to MAX(max[i],hll[i]). */
            if (hllMerge(registers,o) == C_ERR) {
                addReplySds(c,sdsnew(invalid_hll_err));
                return;
            }
        }

        /* Compute cardinality of the resulting set. */
        addReplyLongLong(c,hllCount(hdr,NULL));
        return;
    }

    /* Case 2: cardinality of the single HLL.
     *
     * The user specified a single key. Either return the cached value
     * or compute one and update the cache. */
    o = lookupKeyWrite(c->db,c->argv[1]);
    if (o == NULL) {
        /* No key? Cardinality is zero since no element was added, otherwise
         * we would have a key as HLLADD creates it as a side effect. */
        addReply(c,shared.czero);
    } else {
        if (isHLLObjectOrReply(c,o) != C_OK) return;
        /* The cache in the header may be rewritten below, so continue with
         * the object returned by the unshare call. */
        o = dbUnshareStringValue(c->db,c->argv[1],o);

        /* Check if the cached cardinality is valid. */
        hdr = o->ptr;
        if (HLL_VALID_CACHE(hdr)) {
            /* Just return the cached value, which is stored in the header
             * one byte at a time, least significant byte first. */
            card = (uint64_t)hdr->card[0];
            card |= (uint64_t)hdr->card[1] << 8;
            card |= (uint64_t)hdr->card[2] << 16;
            card |= (uint64_t)hdr->card[3] << 24;
            card |= (uint64_t)hdr->card[4] << 32;
            card |= (uint64_t)hdr->card[5] << 40;
            card |= (uint64_t)hdr->card[6] << 48;
            card |= (uint64_t)hdr->card[7] << 56;
        } else {
            int invalid = 0;
            /* Recompute it and update the cached value. */
            card = hllCount(hdr,&invalid);
            if (invalid) {
                addReplySds(c,sdsnew(invalid_hll_err));
                return;
            }
            /* Store the new value using the same byte order used above. */
            hdr->card[0] = card & 0xff;
            hdr->card[1] = (card >> 8) & 0xff;
            hdr->card[2] = (card >> 16) & 0xff;
            hdr->card[3] = (card >> 24) & 0xff;
            hdr->card[4] = (card >> 32) & 0xff;
            hdr->card[5] = (card >> 40) & 0xff;
            hdr->card[6] = (card >> 48) & 0xff;
            hdr->card[7] = (card >> 56) & 0xff;
            /* This is not considered a read-only command even if the
             * data structure is not modified, since the cached value
             * may be modified and given that the HLL is a Redis string
             * we need to propagate the change. */
            signalModifiedKey(c,c->db,c->argv[1]);
            server.dirty++;
        }
        addReplyLongLong(c,card);
    }
}
**************************************************************************************************************
/* PFMERGE dest src1 src2 src3 ... srcN => OK */
PFMERGE 命令 合并多个HLL到一个HLL
/* PFMERGE implementation: merge the HLLs stored at c->argv[1..] and write
 * the result into the key c->argv[1], creating it when missing.
 *
 * The destination is itself part of the merge, since the loop over the
 * keys starts from argv[1]. Missing keys are skipped. The destination is
 * converted to the dense encoding when at least one input was dense.
 *
 * Replies shared.ok on success, or an error when a value is not a valid
 * HLL, when a merge fails, or when the conversion to dense fails. */
void pfmergeCommand(client *c) {
    uint8_t max[HLL_REGISTERS];
    struct hllhdr *hdr;
    int j;
    int use_dense = 0; /* Use dense representation as target? */

    /* Compute an HLL with M[i] = MAX(M[i]_j).
     * We store the maximum into the max array of registers. We'll write
     * it to the target variable later. */
    memset(max,0,sizeof(max));
    for (j = 1; j < c->argc; j++) {
        /* Check type and size. */
        robj *o = lookupKeyRead(c->db,c->argv[j]);
        if (o == NULL) continue; /* Assume empty HLL for non existing var. */
        if (isHLLObjectOrReply(c,o) != C_OK) return;

        /* If at least one involved HLL is dense, use the dense representation
         * as target ASAP to save time and avoid the conversion step. */
        hdr = o->ptr;
        if (hdr->encoding == HLL_DENSE) use_dense = 1;

        /* Merge with this HLL with our 'max' HLL by setting max[i]
         * to MAX(max[i],hll[i]). */
        if (hllMerge(max,o) == C_ERR) {
            addReplySds(c,sdsnew(invalid_hll_err));
            return;
        }
    }

    /* Create / unshare the destination key's value if needed. */
    robj *o = lookupKeyWrite(c->db,c->argv[1]);
    if (o == NULL) {
        /* Create the key with a string value of the exact length to
         * hold our HLL data structure. sdsnewlen() when NULL is passed
         * is guaranteed to return bytes initialized to zero. */
        o = createHLLObject();
        dbAdd(c->db,c->argv[1],o);
    } else {
        /* If key exists we are sure it's of the right type/size
         * since we checked when merging the different HLLs, so we
         * don't check again. */
        o = dbUnshareStringValue(c->db,c->argv[1],o);
    }

    /* Convert the destination object to dense representation if at least
     * one of the inputs was dense. */
    if (use_dense && hllSparseToDense(o) == C_ERR) {
        addReplySds(c,sdsnew(invalid_hll_err));
        return;
    }

    /* Write the resulting HLL to the destination HLL registers and
     * invalidate the cached value. */
    for (j = 0; j < HLL_REGISTERS; j++) {
        if (max[j] == 0) continue; /* Zero registers are never written. */
        /* Reload the header at every iteration, as o->ptr may have been
         * changed by the previous hllSparseSet() call. */
        hdr = o->ptr;
        switch(hdr->encoding) {
        case HLL_DENSE: hllDenseSet(hdr->registers,j,max[j]); break;
        case HLL_SPARSE: hllSparseSet(o,j,max[j]); break;
        }
    }
    hdr = o->ptr; /* o->ptr may be different now, as a side effect of
                     last hllSparseSet() call. */
    HLL_INVALIDATE_CACHE(hdr);

    signalModifiedKey(c,c->db,c->argv[1]);
    /* We generate a PFADD event for PFMERGE for semantical simplicity
     * since in theory this is a mass-add of elements. */
    notifyKeyspaceEvent(NOTIFY_STRING,"pfadd",c->argv[1],c->db->id);
    server.dirty++;
    addReply(c,shared.ok);
}
**************************************************************************************************************

 

posted on 2021-04-23 17:04  子虚乌有  阅读(155)  评论(0)    收藏  举报