学习hashtable,处理“海量”数据
直接上代码吧,哈希表的逻辑还是很简单的,目的是对比这几种方法的速度,重要的是参照代码,看输出结果:
1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <sys/timeb.h> 4 #include <fstream> 5 #include <string> 6 #include <vector> 7 #include <list> 8 #include <algorithm> 9 #include <set> 10 #include <hash_set> 11 #include <hash_map> 12 13 using namespace std; 14 15 static const int hashtable_length = 49157; 16 17 // 用于定位一个Bucket 18 unsigned int hash_function(const char* str) 19 { 20 const char* end_of_str = str+strlen(str); 21 unsigned int sum = 0; 22 while (end_of_str - str > 3) 23 { 24 sum = (sum + (unsigned int)*((unsigned int*)str))%hashtable_length; 25 str += 4; 26 } 27 char tmp[4] = {0}; 28 strcpy(tmp, str); 29 sum = (sum + (unsigned int)*((unsigned int*)tmp))%hashtable_length; 30 memset(tmp, 0, 4); 31 32 return sum; 33 } 34 35 // 用于在一个Buchet中查找目标 36 bool find_in_bucket(list<string>& l, const char* str) 37 { 38 list<string>::iterator iter; 39 unsigned int hash_key = hash_function(str); 40 bool exist = false; 41 for (iter = l.begin(); iter != l.end(); iter++) 42 if (strcmp(str, iter->c_str()) == 0) 43 return true; 44 return false; 45 } 46 47 // 用于把目标放到Bucket中 48 int insert_in_bucket(list<string>& l, const char* str) 49 { 50 if (!find_in_bucket(l, str)) 51 { 52 l.push_back(string(str)); 53 return l.size(); 54 }else 55 return -1; 56 } 57 58 // 用于在整个hash表中查找目标 59 bool find_in_hashtable(vector<list<string>>& v, const char* str) 60 { 61 return find_in_bucket(v[hash_function(str)], str); 62 } 63 64 // 用于在整个hash表中插入一个元素 65 int insert_in_hashtable(vector<list<string>>& v, const char* str) 66 { 67 return insert_in_bucket(v[hash_function(str)], str); 68 } 69 70 // 过滤掉文本中的标点符号 71 void filter(char* str) 72 { 73 while(*str++) 74 if(*str == ',' || *str == '.' 75 || *str == '?' 
|| *str == '-' 76 || *str == '\"' || *str == '\'' 77 || *str == ')' || *str == '(' 78 || *str == '!') 79 *str = ' '; 80 } 81 82 // 读取一行中的一个单词 83 char* get_word_from_buff(char* &buff, char* word) 84 { 85 while (*buff && *buff == ' ') 86 buff++; 87 if (!*buff) 88 return NULL; 89 int cnt = 0; 90 while (*buff && *buff != ' ') 91 word[cnt++] = *buff++; 92 word[cnt] = 0; 93 return buff; 94 } 95 96 int main() 97 { 98 // 对比哈希表和朴素方法的差别 99 // 任务是存储一个文件中的英文单词,要求不能重复 100 101 timeb time_begin; 102 timeb time_end; 103 ifstream input_file; 104 input_file.open("D:\\input.txt"); 105 char buff[10241] = {0}; // 10KB的缓冲区 106 char word[100]; 107 vector<string> vector_of_words; 108 ftime(&time_begin); 109 // 下面代码速度奇慢无比 110 while (input_file.getline(buff, 10240)) 111 { 112 filter(buff); 113 char* ptr_to_buff = buff; 114 vector<string>::iterator iter = vector_of_words.begin(); 115 while (ptr_to_buff = get_word_from_buff(ptr_to_buff, word)) 116 { 117 int i = 0; 118 for (; i < vector_of_words.size(); i++) 119 if(strcmp(vector_of_words[i].c_str(), word) == 0) 120 break; 121 if (i >= vector_of_words.size()) 122 vector_of_words.push_back(string(word)); 123 } 124 } 125 126 ftime(&time_end); 127 unsigned int seconds = time_end.time - time_begin.time; 128 unsigned int miseconds = time_end.millitm - time_begin.millitm; 129 miseconds = seconds * 1000 + miseconds; 130 printf("朴素的方法:\t处理时间为:\t%u\t毫秒, 统计了%d个单词\n", miseconds, vector_of_words.size()); 131 132 input_file.close(); 133 input_file.open("D:\\input.txt"); 134 vector<list<string>> hashtable_of_words(hashtable_length, list<string>()); 135 ftime(&time_begin); 136 int count = 0; 137 while (input_file.getline(buff, 10240)) 138 { 139 filter(buff); 140 char* ptr_to_buff = buff; 141 while (ptr_to_buff = get_word_from_buff(ptr_to_buff, word)) 142 if(insert_in_hashtable(hashtable_of_words, word) != -1) 143 ++count; 144 } 145 ftime(&time_end); 146 seconds = time_end.time - time_begin.time; 147 miseconds = time_end.millitm - time_begin.millitm; 148 
miseconds = seconds * 1000 + miseconds; 149 printf("hashtable:\t处理时间为:\t%u\t毫秒, 统计了%d个单词\n", miseconds, count); 150 151 input_file.close(); 152 input_file.open("D:\\input.txt"); 153 set<string> set_of_words; 154 ftime(&time_begin); 155 while (input_file.getline(buff, 10240)) 156 { 157 filter(buff); 158 char* ptr_to_buff = buff; 159 while (ptr_to_buff = get_word_from_buff(ptr_to_buff, word)) 160 set_of_words.insert(string(word)); 161 } 162 ftime(&time_end); 163 seconds = time_end.time - time_begin.time; 164 miseconds = time_end.millitm - time_begin.millitm; 165 miseconds = seconds * 1000 + miseconds; 166 printf("rbtree-set:\t处理时间为:\t%u\t毫秒, 统计了%d个单词\n", miseconds, set_of_words.size()); 167 168 input_file.close(); 169 input_file.open("D:\\input.txt"); 170 hash_map<string, int> hashmap_of_words; 171 ftime(&time_begin); 172 while (input_file.getline(buff, 10240)) 173 { 174 filter(buff); 175 char* ptr_to_buff = buff; 176 while (ptr_to_buff = get_word_from_buff(ptr_to_buff, word)) 177 hashmap_of_words[string(word)]++; 178 } 179 ftime(&time_end); 180 seconds = time_end.time - time_begin.time; 181 miseconds = time_end.millitm - time_begin.millitm; 182 miseconds = seconds * 1000 + miseconds; 183 printf("hash_map:\t处理时间为:\t%u\t毫秒, 统计了%d个单词\n", miseconds, hashmap_of_words.size()); 184 185 input_file.close(); 186 input_file.open("D:\\input.txt"); 187 hash_set<string> hashset_of_words; 188 ftime(&time_begin); 189 #if 0 // 下面代码速度奇慢无比,所以注释掉了实际没有执行,我等了半天,没有计算完,不知道是不是逻辑有问题~ 190 while (input_file.getline(buff, 10240)) 191 { 192 filter(buff); 193 char* ptr_to_buff = buff; 194 while (ptr_to_buff = get_word_from_buff(ptr_to_buff, word)) 195 hashset_of_words.insert(string(word)); 196 } 197 #endif 198 ftime(&time_end); 199 seconds = time_end.time - time_begin.time; 200 miseconds = time_end.millitm - time_begin.millitm; 201 miseconds = seconds * 1000 + miseconds; 202 printf("hash_set:\t处理时间为:\t%u\t毫秒, 统计了%d个单词\n", miseconds, hashset_of_words.size()); 203 204 system("pause"); 205 return 0; 
206 }
输出结果:
朴素的方法: 处理时间为: 238594 毫秒, 统计了28661个单词 hashtable: 处理时间为: 2312 毫秒, 统计了28661个单词 rbtree-set: 处理时间为: 13438 毫秒, 统计了28661个单词 hash_map: 处理时间为: 6953 毫秒, 统计了28661个单词 hash_set: 处理时间为: 0 毫秒, 统计了0个单词 请按任意键继续. . .
后来又做了小幅的改动:
1 unsigned int hash_function_opt(const char* str) 2 { 3 const char* end_of_str = str+strlen(str); 4 unsigned int sum = 0; 5 while (end_of_str - str > 3) 6 { 7 sum ^= *((unsigned int*)str); 8 str += 4; 9 } 10 char tmp[4] = {0}; 11 strcpy(tmp, str); 12 sum ^= (unsigned int)*((unsigned int*)tmp); 13 sum %= hashtable_length; 14 memset(tmp, 0, 4); 15 16 return sum; 17 }
// Reports whether bucket `l` already holds a string whose text equals the
// NUL-terminated string `str`. The bucket is scanned front to back and is not
// modified.
bool find_in_bucket_opt(std::list<std::string>& l, const char* str)
{
    std::list<std::string>::iterator cursor = l.begin();
    const std::list<std::string>::iterator stop = l.end();
    while (cursor != stop)
    {
        const bool same_text = (strcmp(str, cursor->c_str()) == 0);
        if (same_text)
            return true;
        ++cursor;
    }
    return false;
}
// Rewrites `str` in place so that every character other than 'a'-'z' and
// 'A'-'Z' becomes a space. The string length is unchanged.
void filter(char* str)
{
    for (char* p = str; *p != '\0'; ++p)
    {
        const char c = *p;
        const bool lower = ('a' <= c && c <= 'z');
        const bool upper = ('A' <= c && c <= 'Z');
        if (lower || upper)
            continue;
        *p = ' ';
    }
}
1 ofstream output_file; 2 output_file.open("D:\\output.txt"); 3 vector<string> all_words; 4 for(vector<list<string>>::iterator i_v = hashtable_of_words.begin(); i_v != hashtable_of_words.end(); i_v++) 5 for(list<string>::iterator i_l = i_v->begin(); i_l != i_v->end(); i_l++) 6 all_words.push_back(*i_l); 7 sort(all_words.begin(), all_words.end()); 8 for(vector<string>::iterator i_v = all_words.begin(); i_v != all_words.end(); i_v++) 9 output_file << *i_v <<endl; 10 output_file.close();
改动之后发现,之前的版本其实是有些小错误的,新的输出结果为:
朴素的方法: 处理时间为: 0 毫秒, 统计了0个单词 hashtable: 处理时间为: 2079 毫秒, 统计了27735个单词 hashtable2: 处理时间为: 2047 毫秒, 统计了27735个单词 rbtree-set: 处理时间为: 0 毫秒, 统计了0个单词 hash_map: 处理时间为: 0 毫秒, 统计了0个单词 hash_set: 处理时间为: 0 毫秒, 统计了0个单词 请按任意键继续. . .
浙公网安备 33010602011771号