# luluping

::  ::  ::  ::  ::

# 1 DJBX33A算法原理

DJBX33A（Daniel J. Bernstein, Times 33 with Addition）哈希算法速度非常快，并且散列效果非常好（冲突少，分布均匀），是比较理想的字符串哈希算法，目前被广泛应用在多个软件项目中，例如：PHP、Python、Apache、Nginx、BerkeleyDB 等。
DJBX33A算法简单实现：

/*
 * Minimal DJBX33A ("times 33 with addition") string hash.
 *
 * str: bytes to hash; does not need to be NUL-terminated.
 * len: number of bytes of str to consume.
 *
 * Returns the hash. The running value starts at 0, so an empty input
 * hashes to 0. All arithmetic is unsigned and wraps on overflow.
 */
unsigned long djbx33a_hash(const char *str, size_t len)
{
    unsigned long hash = 0U;

    for (size_t i = 0; i < len; ++i) {
        /*
         * Read each byte as unsigned char: plain char may be signed, and
         * a sign-extended byte >= 0x80 would make the result depend on
         * the platform.
         *
         * hash * 33 may equivalently be written ((hash << 5) + hash).
         */
        hash = hash * 33 + (unsigned char)str[i];
    }

    return hash;
}

# 2 DJBX33A算法典型实现

## 2.1 PHP(zend_string.h)

/*
 * DJBX33A hash of the first len bytes of str, seeded with 5381.
 * Whole groups of eight bytes are consumed by the unrolled loop; the
 * switch then mixes in the 0-7 bytes that are left over.
 */
static zend_always_inline zend_ulong zend_inline_hash_func(const char *str, size_t len)
{
    zend_ulong hash = Z_UL(5381);

    /* eight "multiply by 33, add next byte" steps per iteration */
    while (len >= 8) {
        hash = hash * 33 + *str++;
        hash = hash * 33 + *str++;
        hash = hash * 33 + *str++;
        hash = hash * 33 + *str++;
        hash = hash * 33 + *str++;
        hash = hash * 33 + *str++;
        hash = hash * 33 + *str++;
        hash = hash * 33 + *str++;
        len -= 8;
    }

    /* remaining 0-7 bytes: enter at the matching case and fall through */
    switch (len) {
        case 7: hash = hash * 33 + *str++; /* fallthrough... */
        case 6: hash = hash * 33 + *str++; /* fallthrough... */
        case 5: hash = hash * 33 + *str++; /* fallthrough... */
        case 4: hash = hash * 33 + *str++; /* fallthrough... */
        case 3: hash = hash * 33 + *str++; /* fallthrough... */
        case 2: hash = hash * 33 + *str++; /* fallthrough... */
        case 1: hash = hash * 33 + *str++; break;
        case 0: break;
EMPTY_SWITCH_DEFAULT_CASE()
    }

    /* A hash of zero is not allowed, so the top bit is always set */
#if SIZEOF_ZEND_LONG == 8
    return hash | Z_UL(0x8000000000000000);
#elif SIZEOF_ZEND_LONG == 4
    return hash | Z_UL(0x80000000);
#else
# error "Unknown SIZEOF_ZEND_LONG"
#endif
}

/* Related type and macro definitions, quoted from other files */
typedef uint32_t zend_ulong;
/* Z_UL(i) expands to UINT32_C(i) */
#define Z_UL(i) UINT32_C(i)
/* UINT32_C(c) pastes a U suffix onto c, making it an unsigned constant */
#define UINT32_C(c) c ## U

1

/*
 * DJBX33A (Daniel J. Bernstein, Times 33 with Addition)
 *
 * This is Daniel J. Bernstein's popular "times 33" hash function as
 * posted by him years ago on comp.lang.c. It basically uses a function
 * like "hash(i) = hash(i-1) * 33 + str[i]". This is one of the best
 * known hash functions for strings. Because it is both computed very
 * fast and distributes very well.
 *
 * The magic of number 33, i.e. why it works better than many other
 * constants, prime or not, has never been adequately explained by
 * anyone. So I try an explanation: if one experimentally tests all
 * multipliers between 1 and 256 (as RSE did now) one detects that even
 * numbers are not useable at all. The remaining 128 odd numbers
 * (except for the number 1) work more or less all equally well. They
 * all distribute in an acceptable way and this way fill a hash table
 * with an average percent of approx. 86%.
 *
 * If one compares the Chi^2 values of the variants, the number 33 not
 * even has the best value. But the number 33 and a few other equally
 * good numbers like 17, 31, 63, 127 and 129 have nevertheless a great
 * advantage to the remaining numbers in the large set of possible
 * multipliers: their multiply operation can be replaced by a faster
 * operation based on just one shift plus either a single addition
 * or subtraction operation. And because a hash function has to both
 * distribute good _and_ has to be very fast to compute, those few
 * numbers should be preferred and seems to be the reason why Daniel J.
 * Bernstein also preferred it.
 *
 *
 *                  -- Ralf S. Engelschall <rse@engelschall.com>
 */
static zend_always_inline zend_ulong zend_inline_hash_func(const char *str, size_t len)
{
/* function body elided in this excerpt */
...
}

## 2.2 Apache(apr_hash.c)

/*
 * Default key hash: folds every key byte into the running value as
 * hash = hash * 33 + byte, starting from the caller-supplied `hash`.
 *
 * char_key: the key bytes.
 * klen:     in/out key length. When it holds APR_HASH_KEY_STRING the key
 *           is read up to its NUL terminator and *klen is overwritten
 *           with the number of bytes consumed; otherwise *klen bytes are
 *           hashed and *klen is left untouched.
 * hash:     initial hash value.
 *
 * Returns the final hash value.
 */
static unsigned int hashfunc_default(const char *char_key, apr_ssize_t *klen,
                                     unsigned int hash)
{
    const unsigned char *const start = (const unsigned char *)char_key;
    const unsigned char *cursor = start;

    /*
     * This is the popular "times 33" hash algorithm which is used by
     * perl and also appears in Berkeley DB. This is one of the best
     * known hash functions for strings because it is both computed
     * very fast and distributes very well.
     *
     * The originator may be Dan Bernstein but the code in Berkeley DB
     * cites Chris Torek as the source. The best citation I have found
     * is "Chris Torek, Hash function for text in C, Usenet message
     * <27038@mimsy.umd.edu> in comp.lang.c , October, 1990." in Rich
     * Salz's USENIX 1992 paper about INN which can be found at
     * <http://citeseer.nj.nec.com/salz92internetnews.html>.
     *
     * The magic of number 33, i.e. why it works better than many other
     * constants, prime or not, has never been adequately explained by
     * anyone. So I try an explanation: if one experimentally tests all
     * multipliers between 1 and 256 (as I did while writing a low-level
     * data structure library some time ago) one detects that even
     * numbers are not useable at all. The remaining 128 odd numbers
     * (except for the number 1) work more or less all equally well.
     * They all distribute in an acceptable way and this way fill a hash
     * table with an average percent of approx. 86%.
     *
     * If one compares the chi^2 values of the variants (see
     * Bob Jenkins' "Hashing Frequently Asked Questions" at
     * http://burtleburtle.net/bob/hash/hashfaq.html for a description
     * of chi^2), the number 33 not even has the best value. But the
     * number 33 and a few other equally good numbers like 17, 31, 63,
     * 127 and 129 have nevertheless a great advantage to the remaining
     * numbers in the large set of possible multipliers: their multiply
     * operation can be replaced by a faster operation based on just one
     * shift plus either a single addition or subtraction operation. And
     * because a hash function has to both distribute good _and_ has to
     * be very fast to compute, those few numbers should be preferred.
     *
     *                  -- Ralf S. Engelschall <rse@engelschall.com>
     */

    if (*klen != APR_HASH_KEY_STRING) {
        /* explicit length: consume exactly *klen bytes */
        apr_ssize_t remaining = *klen;

        while (remaining != 0) {
            hash = hash * 33 + *cursor++;
            remaining--;
        }
        return hash;
    }

    /* NUL-terminated key: hash up to the terminator, then report the length */
    while (*cursor != '\0') {
        hash = hash * 33 + *cursor++;
    }
    *klen = cursor - start;

    return hash;
}

/* Call site: ht->seed is passed as the initial hash value */
hash = hashfunc_default(key, &klen, ht->seed);

1

## 2.3 BerkeleyDB(src\hash\hash_func.c)

/*  DJBX33A algorithm
 * __ham_func4 --
 *  Chris Torek's hash function.  Although this function performs only
 *  slightly worse than __ham_func5 on strings, it performs horribly on
 *  numbers.
 *
 *  Hashes the len bytes at key as h = h * 33 + byte, starting from 0,
 *  and returns the result.  A zero-length key hashes to 0.
 *
 * PUBLIC: u_int32_t __ham_func4 __P((DB *, const void *, u_int32_t));
 */
u_int32_t
__ham_func4(dbp, key, len)
DB *dbp;
const void *key;
u_int32_t len;
{
const u_int8_t *k;
u_int32_t h, loop;

/*
 * dbp does not take part in the hash; COMPQUIET presumably only silences
 * an unused-parameter warning -- TODO confirm against its definition.
 */
if (dbp != NULL)
COMPQUIET(dbp, NULL);

/*
 * Required early exit: with len == 0 the pass counter below would start
 * at 0 and wrap around when decremented at the end of the first pass.
 */
if (len == 0)
return (0);

/* HASH4a: h = h * 31 + next byte.  HASH4b: h = h * 33 + next byte. */
#define HASH4a  h = (h << 5) - h + *k++;
#define HASH4b  h = (h << 5) + h + *k++;
/* The times-33 variant is the one actually used. */
#define HASH4   HASH4b
h = 0;
k = key;

/*
 * Duff's device: loop is the number of passes, i.e. len / 8 rounded up.
 * The switch jumps into the middle of the unrolled body so that the
 * first pass handles the len % 8 leftover bytes (a full 8 when the
 * remainder is 0); every later pass handles 8 bytes.
 */
loop = (len + 8 - 1) >> 3;
switch (len & (8 - 1)) {
case 0:
do {
HASH4;
case 7:
HASH4;
case 6:
HASH4;
case 5:
HASH4;
case 4:
HASH4;
case 3:
HASH4;
case 2:
HASH4;
case 1:
HASH4;
} while (--loop);
}
return (h);
}

## 2.4 Python(pyhash.c)

/*
 * Hash the len bytes at src.
 *
 * An empty input hashes to 0. When Py_HASH_CUTOFF is positive, inputs
 * shorter than the cutoff are hashed with an inline DJBX33A; everything
 * else is delegated to PyHash_Func.hash. A result of -1 is replaced by -2.
 */
Py_hash_t
_Py_HashBytes(const void *src, Py_ssize_t len)
{
    Py_hash_t result;

    /*
      The empty string hashes to 0 instead of (prefix ^ suffix), since
      this slightly obfuscates the hash secret
    */
    if (len == 0) {
        return 0;
    }

#ifdef Py_HASH_STATS
    hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
#endif

#if Py_HASH_CUTOFF > 0
    if (len < Py_HASH_CUTOFF) {
        /* Very small input: run DJBX33A inline, starting from 5381. */
        const unsigned char *cursor = src;
        Py_uhash_t acc = 5381;

        switch(len) {
            /* one "acc = acc * 33 + byte" step per remaining byte */
            case 7: acc = acc * 33 + *cursor++; /* fallthrough */
            case 6: acc = acc * 33 + *cursor++; /* fallthrough */
            case 5: acc = acc * 33 + *cursor++; /* fallthrough */
            case 4: acc = acc * 33 + *cursor++; /* fallthrough */
            case 3: acc = acc * 33 + *cursor++; /* fallthrough */
            case 2: acc = acc * 33 + *cursor++; /* fallthrough */
            case 1: acc = acc * 33 + *cursor++; break;
            default:
                Py_UNREACHABLE();
        }
        /* mix in the length and the secret suffix */
        acc ^= len;
        acc ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
        result = (Py_hash_t)acc;
    }
    else
#endif /* Py_HASH_CUTOFF */
        result = PyHash_Func.hash(src, len);

    /* -1 is never returned; it is mapped to -2 */
    return (result == -1) ? -2 : result;
}

/* Py_hash_t is an alias of Py_ssize_t, which is itself an alias of ssize_t */
typedef Py_ssize_t      Py_hash_t;
typedef ssize_t         Py_ssize_t;

/* ssize_t is __int64 when MS_WIN64 is defined, and _W64 int otherwise */
#ifdef MS_WIN64
typedef __int64 ssize_t;
#else
typedef _W64 int ssize_t;
#endif

/* Py_UNREACHABLE() expands to a call to abort() */
#define Py_UNREACHABLE() abort()

/* Describes a hash implementation: the function pointer plus metadata. */
typedef struct {
/* hash function: takes a buffer pointer and a Py_ssize_t, returns a Py_hash_t */
Py_hash_t (*const hash)(const void *, Py_ssize_t);
/* presumably the name of the algorithm -- TODO confirm */
const char *name;
/* NOTE(review): presumably the widths, in bits, of the hash output and of the seed -- confirm */
const int hash_bits;
const int seed_bits;
} PyHash_FuncDef;

# 3 DJBX33A算法相似实现

Tokyo Cabinet、Nginx 等软件项目通过改变每次相乘的倍数（31、37）获得与 DJBX33A 相似的哈希函数。

## 3.1 Tokyo Cabinet内存数据库

/* tctdb.c */
/* Compute the hash value of a record.
   `pkbuf' points to the region holding the primary key.
   `pksiz' is the size of that region.
   Returns the hash value: the 32-bit running hash (seed 19780211,
   multiplier 37) converted to 16 bits. */
static uint16_t tctdbidxhash(const char *pkbuf, int pksiz){
  assert(pkbuf && pksiz && pksiz >= 0);
  const uint8_t *byte = (const uint8_t *)pkbuf;
  uint32_t hash = 19780211;
  for(int left = pksiz; left != 0; left--){
    hash = hash * 37 + *byte;
    byte++;
  }
  return hash;
}

/* tchdb.c */
/* Compute the bucket index of a record.
   `hdb' is the hash database object.
   `kbuf' points to the region holding the key.
   `ksiz' is the size of that region.
   `hp' points to the variable that receives the second hash value.
   Returns the bucket index. */
static uint64_t tchdbbidx(TCHDB *hdb, const char *kbuf, int ksiz, uint8_t *hp){
  assert(hdb && kbuf && ksiz >= 0 && hp);
  const uint8_t *head = (const uint8_t *)kbuf;
  const uint8_t *tail = head + ksiz;
  uint64_t idx = 19780211;
  uint32_t hash = 751;
  /* idx reads the key front to back (x37, add); hash reads it back to front (x31, xor) */
  for(int left = ksiz; left != 0; left--){
    idx = idx * 37 + *head++;
    tail--;
    hash = (hash * 31) ^ *tail;
  }
  *hp = hash;
  return idx % hdb->bnum;
}

## 3.2 Nginx服务器

/* ngx_hash.c */
/*
 * Hash the len bytes at data by applying ngx_hash() to each byte in
 * order, starting from a key of 0. Returns the resulting key.
 */
ngx_uint_t
ngx_hash_key(u_char *data, size_t len)
{
    ngx_uint_t  key = 0;

    while (len-- > 0) {
        key = ngx_hash(key, *data);
        data++;
    }

    return key;
}

/* ngx_hash.h */
/*
 * One hash step: key * 31 + c, with key converted to ngx_uint_t.
 * Both parameters are parenthesized so that expression arguments
 * (for example ngx_hash(a + b, x | y)) are grouped as intended.
 */
#define ngx_hash(key, c)   ((ngx_uint_t) (key) * 31 + (c))

7