常用hash函数对比分析(一)

主要目标:寻找一个hash函数,高效的支持64位整数运算,使得在速度、空间等效率相对其它函数函数较高,以及内部运算时32位整数运算。

测试了"RSHash","JSHash","PJWHash","ELFHash","BKDRHash","SDBMHash","DJBHash","DEKHash","BPHash","FNVHash","APHash"在链式hash表情况下桶的利用率,以及碰撞率。结果分析就的文档丢失了(算是一次教训吧,没有及时对工作进行总结)

不足之处:没有分析各个hash函数的时间、空间效率,以及输入和输出的随机性或者正态性

自己设计的hash函数实现:

unsigned long long matrix[]={
12167790980612060479L, 13344170658793919998L, 8048017138412720916, 15332027341811451737L, 17032184339690126387L, 8116965580610093270, 7637225975870195902, 3639441384019100182, 15226039615865517844L, 8509997149940110407, 7567956886445709541, 11819756187988552427L, 4605006862157952628, 5016443212556328023, 9002359712050550171, 6632389880501523789, 14647710699697346216L, 9999004615168831203L, 2225254567237014321, 1466671485117024538, 10418811171466322749L, 14988736636330905679L, 5688277523122870273, 6044889914109049598, 14531873893893142015L, 10954372033167844835L, 394252017736706278, 2158134086325640502, 1901858137268079707, 12439218790442925755L, 10628840860977635074L, 1248934315101581618};

void changeToArray(unsigned long long input , int *array,int len) {
len = 64;
for(int i=0;i<len;i++) {
if(((input>>i)&0x1) == 1) {
array[i] = 1;
} else {
array[i] = 0;
}
}
}
int Matrix[32][64];
void changeToMatrix(){
for(int i=0;i<32;i++)
for(int j=0;j<64;j++) {
if(((matrix[i]>>j)&0x1)==1) {
Matrix[i][j] = 1;
} else {
Matrix[i][j] = 0;
}
}
}
unsigned int MatrixHash(unsigned long long input) {
int tmp[64];
changeToArray(input,tmp,64);
int res[32];
int tmpres[32];
for(int j=0;j<64;j++) {
for(int k=0;k<32;k++) {
res[k] = res[k] ^ tmpres[k];
tmpres[k] = 0;
}
for(int i=0;i<32;i++) {
if(tmp[j]==0) {
tmpres[i] = 0;
} else {
tmpres[i] = Matrix[i][j];
}
}
}
}

 

改进版本:

unsigned long matrix[]={
694164548, 3789149486, 3807463199, 2684797435, 2359943013, 2231240996, 2135863124, 1211164704, 2302089482, 4105647604, 3076642034, 72161852, 59560020, 611878035, 2814490697, 3915797072, 1219075273, 3978906545, 193953705, 4132630722, 2627050652, 1989142569, 2745032496, 735086709, 3485798578, 1637027010, 2467907528, 3110718702, 1201254602, 3737777921, 796676896, 2349326933, 2449810837, 3910433321, 3550767862, 1644342526, 1438602584, 2179832898, 3868189175, 1814084994, 615833333, 2031832363, 3130167043, 4200349959, 1930419194, 3787258680, 3369981751, 3329843958, 3446569769, 3858118227, 163055231, 1009035244, 422846498, 1011763997, 533633029, 2557610330, 4221463260, 95017657, 88369066, 3227540105, 3815919250, 2234633741, 1819183943, 556442040
};
void changeToArray(unsigned long long input , int *array,int len) {
len = 64;
for(int i=0;i<len;i++) {
if(((input>>i)&0x1) == 1) {
array[i] = 1;
} else {
array[i] = 0;
}
}
}
int Matrix[32][64];
void changeToMatrix(){
for(int i=0;i<32;i++)
for(int j=0;j<64;j++) {
if(((matrix[i]>>j)&0x1)==1) {
Matrix[i][j] = 1;
} else {
Matrix[i][j] = 0;
}
}
}
unsigned int MatrixHash(unsigned long long input) {
int tmp[64];
changeToArray(input,tmp,64); //也无需转变,要学会对整数的位进行操作
// for(int i=0;i<64;i++)
// cout<<tmp[i];
unsigned int res=0;
for(int i=0;i<64;i++) {
if(tmp[i]==1) {  //虽然可以并行,但由于运算非常简单,并行线程间切换带来的开销可能远比线程本身的开销大
res = res ^ matrix[i]; //随机数组无法判定是否可行或者高效,以及行满秩
}
}
return res;
}

/*************************************************************************
> File Name: hash_table.cpp
> Author:wjy
> Mail: wjy6719004@163.com
> Created Time: 二 7/ 8 09:54:52 2014
************************************************************************/

#include<iostream>
#include <time.h>
#include <string.h>
#include <ctime>
#include <stdlib.h>
#include <stdio.h>
using namespace std;


unsigned int RSHash(char* str, unsigned int len)
{
unsigned int b = 378551;
unsigned int a = 63689;
unsigned int hash = 0;
unsigned int i = 0;

for(i = 0; i < len; str++, i++)
{
hash = hash * a + (*str);
a = a * b;
}

return hash;
}
/* End Of RS Hash Function */


unsigned int JSHash(char* str, unsigned int len)
{
unsigned int hash = 1315423911;
unsigned int i = 0;

for(i = 0; i < len; str++, i++)
{
hash ^= ((hash << 5) + (*str) + (hash >> 2));
}

return hash;
}
/* End Of JS Hash Function */


unsigned int PJWHash(char* str, unsigned int len)
{
const unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8);
const unsigned int ThreeQuarters = (unsigned int)((BitsInUnsignedInt * 3) / 4);
const unsigned int OneEighth = (unsigned int)(BitsInUnsignedInt / 8);
const unsigned int HighBits = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth);
unsigned int hash = 0;
unsigned int test = 0;
unsigned int i = 0;

for(i = 0; i < len; str++, i++)
{
hash = (hash << OneEighth) + (*str);

if((test = hash & HighBits) != 0)
{
hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits));
}
}

return hash;
}
/* End Of P. J. Weinberger Hash Function */


unsigned int ELFHash(char* str, unsigned int len)
{
unsigned int hash = 0;
unsigned int x = 0;
unsigned int i = 0;

for(i = 0; i < len; str++, i++)
{
hash = (hash << 4) + (*str);
if((x = hash & 0xF0000000L) != 0)
{
hash ^= (x >> 24);
}
hash &= ~x;
}

return hash;
}
/* End Of ELF Hash Function */


unsigned int BKDRHash(char* str, unsigned int len)
{
unsigned int seed = 131; /* 31 131 1313 13131 131313 etc.. */
unsigned int hash = 0;
unsigned int i = 0;

for(i = 0; i < len; str++, i++)
{
hash = (hash * seed) + (*str);
}

return hash;
}
/* End Of BKDR Hash Function */


unsigned int SDBMHash(char* str, unsigned int len)
{
unsigned int hash = 0;
unsigned int i = 0;

for(i = 0; i < len; str++, i++)
{
hash = (*str) + (hash << 6) + (hash << 16) - hash;
}

return hash;
}
/* End Of SDBM Hash Function */


unsigned int DJBHash(char* str, unsigned int len)
{
unsigned int hash = 5381;
unsigned int i = 0;

for(i = 0; i < len; str++, i++)
{
hash = ((hash << 5) + hash) + (*str);
}

return hash;
}
/* End Of DJB Hash Function */


unsigned int DEKHash(char* str, unsigned int len)
{
unsigned int hash = len;
unsigned int i = 0;

for(i = 0; i < len; str++, i++)
{
hash = ((hash << 5) ^ (hash >> 27)) ^ (*str);
}
return hash;
}
/* End Of DEK Hash Function */


unsigned int BPHash(char* str, unsigned int len)
{
unsigned int hash = 0;
unsigned int i = 0;
for(i = 0; i < len; str++, i++)
{
hash = hash << 7 ^ (*str);
}

return hash;
}
/* End Of BP Hash Function */


unsigned int FNVHash(char* str, unsigned int len)
{
const unsigned int fnv_prime = 0x811C9DC5;
unsigned int hash = 0;
unsigned int i = 0;

for(i = 0; i < len; str++, i++)
{
hash *= fnv_prime;
hash ^= (*str);
}

return hash;
}
/* End Of FNV Hash Function */


unsigned int APHash(char* str, unsigned int len)
{
unsigned int hash = 0xAAAAAAAA;
unsigned int i = 0;

for(i = 0; i < len; str++, i++)
{
hash ^= ((i & 1) == 0) ? ( (hash << 7) ^ (*str) * (hash >> 3)) :
(~((hash << 11) + (*str) ^ (hash >> 5)));
}

return hash;
}
/* End Of AP Hash Function */
const int NUM = 1000003; //10000019
struct HashNode {
unsigned int key;
unsigned int value;
HashNode *next;
bool flag;
HashNode(){
flag=false;
next = NULL;
key=0;
value = 0;
}
HashNode(int key,int value):key(key),value(value){
next = NULL;
flag = false;
}
};
HashNode hnode[NUM];
unsigned int (*hashvalue)(char *,unsigned int);
void setHash(int choice) {
switch(choice){
case 1: hashvalue = RSHash;
break;
case 2: hashvalue = JSHash;
break;
case 3: hashvalue = PJWHash;
break;
case 4: hashvalue = ELFHash;
break;
case 5: hashvalue = BKDRHash;
break;
case 6: hashvalue = SDBMHash;
break;
case 7: hashvalue = DJBHash;
break;
case 8: hashvalue = DEKHash;
break;
case 9: hashvalue = BPHash;
break;
case 10: hashvalue = FNVHash;
break;
case 11: hashvalue = APHash;
break;
default:
break;
}
}
/*unsigned int hashvalue(char *str) {
register unsigned int h;
register unsigned char *p;
for(h=0,p=(unsigned char *)str;*p;p++) {
h = (h<<5) - h + (*p); // h = 31 * h + *p;
}
return h;
}*/
bool isContainsKey(int key) {
char tmp[65];
// itoa(key,tmp,10);
sprintf(tmp,"%d",key);
int len = strlen(tmp);
unsigned int hashcode = hashvalue(tmp,len)%NUM;
if(hnode[hashcode].flag==false) {
return false;
} else {
HashNode *p = &hnode[hashcode];
while(p!=NULL) {
if(p->key==key) {
return true;
}
p = p -> next;
}
}
return false;
}
void put(int key,int value) {
char tmp[65];
// itoa(key,tmp,10);
sprintf(tmp,"%d",key);
int len=strlen(tmp);
unsigned int hashcode = hashvalue(tmp,len)%NUM;
if(hnode[hashcode].flag==false) {
hnode[hashcode].flag = true;
hnode[hashcode].key = key;
hnode[hashcode].value = value;
} else if(!isContainsKey(key)){
HashNode *p = new HashNode(key,value);
p->flag = true;
p->next = hnode[hashcode].next;
hnode[hashcode].next = p;
} else {
HashNode *p = &hnode[hashcode];
while(p!=NULL) {
if(p->key==key) {
p->value = value;
break;
}
p = p -> next;
}
}
}
int get(int key){
char tmp[65];
// itoa(key,tmp,10);
sprintf(tmp,"%d",key);
int len = strlen(tmp);
unsigned int hashcode = hashvalue(tmp,len)%NUM;
if(isContainsKey(key)==false){
// alert("the key not exists");
return -1;
} else {
HashNode *p = &hnode[hashcode];
while(p!=NULL) {
if(p->key==key) {
return p->value;
}
p = p -> next;
}
}
return -1;
}
int main(){
// int seed = time(NULL);
cout<<"**************hash 函数测试******************"<<endl;
printf("General Purpose Hash Function Algorithms Test\n");
printf(" 1. RS-Hash Function Value\n ");
printf(" 2. JS-Hash Function Value\n ");
printf(" 3. PJW-Hash Function Value\n ");
printf(" 4. ELF-Hash Function Value\n");
printf(" 5. BKDR-Hash Function Value\n");
printf(" 6. SDBM-Hash Function Value\n");
printf(" 7. DJB-Hash Function Value\n");
printf(" 8. DEK-Hash Function Value\n");
printf(" 9. BP-Hash Function Value\n");
printf("10. FNV-Hash Function Value\n");
printf("11. AP-Hash Function Value\n");
string hash_names[]={"0","RSHash","JSHash","PJWHash","ELFHash","BKDRHash","SDBMHash","DJBHash","DEKHash","BPHash","FNVHash","APHash"};
string count_names[]={"0","RSHash_count","JSHash_count","PJWHash_count","ELFHash_count","BKDRHash_count","SDBMHash_count","DJBHash_count","DEKHash_count","BPHash_count","FNVHash_count","APHash_count"};
for(int i=1;i<=11;i++) {
setHash(i);
FILE *fp = fopen("data.txt","r");
if(fp==NULL) {
cout<<"end"<<endl;
exit(-1);
}
char key[65];
while(!feof(fp)) {
fscanf(fp,"%s",key);
// cout<<key<<endl;
// i++;
put(atol(key),1);
}
fclose(fp);
fp = fopen(hash_names[i].c_str(),"w");
FILE* fp2 = fopen(count_names[i].c_str(),"w");
int sum = 0;
for(int i=0;i<NUM;i++) {
int count = 0;
HashNode *p = &hnode[i];
while(p!=NULL && p->flag!=false) {
count++;
fprintf(fp,"%d->%d\n",p->key,p->value);
p=p->next;
}
if(count > 0) {
sum += 1;
}
fprintf(fp,"***************\n");
if(count>=1) {
// cout<<count<<endl;
// fprintf(fp2,"%d th node has %d collision\n",i,count);
fprintf(fp2,"%d\n",count);
}
}
cout<<count_names[i]<<"\t"<<sum<<"\t"<<NUM<<endl;
fclose(fp2);
fclose(fp);
}
return 0;
}

 

结果测量的代码:

python生成64位随机数输入(思考如何保证无重复随机数,利用随机排序生成,冒泡排序)

#!/bin/python
import random
import os

array=[]
for i in range(32):
array.append(random.randint(0,0xffffffffffffffff))

matrix = open("matrix.txt","w");
for ele in array:
matrix.write(str(ele));
matrix.write('\n');
matrix.close()

data = open("data.txt","r");

larray=[]
for i in range(64):
larray.append(random.randint(0,0xffffffff))
lmatrix = open("lmatrix.txt","w");
for ele in larray:
lmatrix.write(str(ele));
lmatrix.write('\n');
lmatrix.close()

 

统计代码:

1、 排序

cat $1 |sort -n|uniq -c > $2  

2、对所有文件统计

#!/bin/sh
dir=$1
filelist=`ls $dir`
array=($filelist)
if [ -d tmp_result ]
then
rm -rf tmp_result
fi
mkdir tmp_result
for ((i=0;i<${#array[@]};i++));do
sh sort.sh $dir${array[$i]} ./tmp_result/${array[$i]}
echo $i
done

3、求和

#!/bin/python

import sys
import os

if len(sys.argv) <2 :
print "python sum.py file"
exit(0)
sum = 0
fp = open(sys.argv[1],"r")
for line in fp:
sum+=int(line.strip())

print sum

进一步参考文件:http://programmers.stackexchange.com/questions/49550/which-hashing-algorithm-is-best-for-uniqueness-and-speed (分析非常好,也提供了许多的参考文件)

整数hash速度非常高,强度相对可接收的hash算法:http://burtleburtle.net/bob/hash/integer.html 

posted @ 2014-08-06 23:33  purejade  阅读(2532)  评论(0编辑  收藏  举报