学习hashtable，处理“海量”数据

直接上代码。哈希表的逻辑本身很简单，这里的目的是对比几种去重方法的速度；请对照代码查看后面的输出结果：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/timeb.h>
#include <algorithm>
#include <fstream>
#include <hash_map>
#include <hash_set>
#include <list>
#include <set>
#include <string>
#include <vector>
 12 
 13 using namespace std;
 14 
 15 static const int hashtable_length    = 49157;
 16 
 17 // 用于定位一个Bucket
 18 unsigned int hash_function(const char* str)
 19 {
 20     const char* end_of_str = str+strlen(str);
 21     unsigned int sum = 0;
 22     while (end_of_str - str > 3)
 23     {
 24         sum = (sum + (unsigned int)*((unsigned int*)str))%hashtable_length;
 25         str += 4;
 26     }
 27     char tmp[4] = {0};
 28     strcpy(tmp, str);
 29     sum = (sum + (unsigned int)*((unsigned int*)tmp))%hashtable_length;
 30     memset(tmp, 0, 4);
 31 
 32     return sum;
 33 }
 34 
 35 // 用于在一个Buchet中查找目标
 36 bool find_in_bucket(list<string>& l, const char* str)
 37 {
 38     list<string>::iterator iter;
 39     unsigned int hash_key = hash_function(str);
 40     bool exist = false;
 41     for (iter = l.begin(); iter != l.end(); iter++)
 42         if (strcmp(str, iter->c_str()) == 0)
 43                 return true;
 44     return false;
 45 }
 46 
 47 // 用于把目标放到Bucket中
 48 int insert_in_bucket(list<string>& l, const char* str)
 49 {
 50     if (!find_in_bucket(l, str))
 51     {
 52         l.push_back(string(str));
 53         return l.size();
 54     }else
 55         return -1;
 56 }
 57 
 58 // 用于在整个hash表中查找目标
 59 bool find_in_hashtable(vector<list<string>>& v, const char* str)
 60 {
 61     return find_in_bucket(v[hash_function(str)], str);
 62 }
 63 
 64 // 用于在整个hash表中插入一个元素
 65 int insert_in_hashtable(vector<list<string>>& v, const char* str)
 66 {
 67     return insert_in_bucket(v[hash_function(str)], str);
 68 }
 69 
 70 // 过滤掉文本中的标点符号
 71 void filter(char* str)
 72 {
 73     while(*str++)
 74         if(*str == ',' || *str == '.' 
 75             || *str == '?' || *str == '-' 
 76             || *str == '\"' || *str == '\'' 
 77             || *str == ')' || *str == '('
 78             || *str == '!')
 79             *str = ' ';
 80 }
 81 
 82 // 读取一行中的一个单词
 83 char* get_word_from_buff(char* &buff, char* word)
 84 {
 85     while (*buff && *buff == ' ')
 86         buff++;
 87     if (!*buff)
 88         return NULL;
 89     int cnt = 0;
 90     while (*buff && *buff != ' ')
 91         word[cnt++] = *buff++;
 92     word[cnt] = 0;
 93     return buff;
 94 }
 95 
 96 int main()
 97 {
 98     // 对比哈希表和朴素方法的差别
 99     // 任务是存储一个文件中的英文单词，要求不能重复
100 
101     timeb time_begin;
102     timeb time_end;
103     ifstream input_file;
104     input_file.open("D:\\input.txt");
105     char buff[10241] = {0};    // 10KB的缓冲区
106     char word[100];
107     vector<string> vector_of_words;
108     ftime(&time_begin);
109     // 下面代码速度奇慢无比
110     while (input_file.getline(buff, 10240))
111     {
112         filter(buff);
113         char* ptr_to_buff = buff;
114         vector<string>::iterator iter = vector_of_words.begin();
115         while (ptr_to_buff = get_word_from_buff(ptr_to_buff, word))
116         {
117             int i = 0;
118             for (; i < vector_of_words.size(); i++)
119                 if(strcmp(vector_of_words[i].c_str(), word) == 0)
120                     break;
121             if (i >= vector_of_words.size())
122                 vector_of_words.push_back(string(word));
123         }
124     }
125 
126     ftime(&time_end);
127     unsigned int seconds = time_end.time - time_begin.time;
128     unsigned int miseconds = time_end.millitm - time_begin.millitm;
129     miseconds = seconds * 1000 + miseconds;
130     printf("朴素的方法：\t处理时间为：\t%u\t毫秒, 统计了%d个单词\n", miseconds, vector_of_words.size());
131 
132     input_file.close();
133     input_file.open("D:\\input.txt");
134     vector<list<string>> hashtable_of_words(hashtable_length, list<string>());
135     ftime(&time_begin);
136     int count = 0;
137     while (input_file.getline(buff, 10240))
138     {
139         filter(buff);
140         char* ptr_to_buff = buff;
141         while (ptr_to_buff = get_word_from_buff(ptr_to_buff, word))
142             if(insert_in_hashtable(hashtable_of_words, word) != -1)
143                 ++count;
144     }
145     ftime(&time_end);
146     seconds = time_end.time - time_begin.time;
147     miseconds = time_end.millitm - time_begin.millitm;
148     miseconds = seconds * 1000 + miseconds;
149     printf("hashtable：\t处理时间为：\t%u\t毫秒, 统计了%d个单词\n", miseconds, count);
150 
151     input_file.close();
152     input_file.open("D:\\input.txt");
153     set<string> set_of_words;
154     ftime(&time_begin);
155     while (input_file.getline(buff, 10240))
156     {
157         filter(buff);
158         char* ptr_to_buff = buff;
159         while (ptr_to_buff = get_word_from_buff(ptr_to_buff, word))
160             set_of_words.insert(string(word));
161     }
162     ftime(&time_end);
163     seconds = time_end.time - time_begin.time;
164     miseconds = time_end.millitm - time_begin.millitm;
165     miseconds = seconds * 1000 + miseconds;
166     printf("rbtree-set：\t处理时间为：\t%u\t毫秒, 统计了%d个单词\n", miseconds, set_of_words.size());
167 
168     input_file.close();
169     input_file.open("D:\\input.txt");
170     hash_map<string, int> hashmap_of_words;
171     ftime(&time_begin);
172     while (input_file.getline(buff, 10240))
173     {
174         filter(buff);
175         char* ptr_to_buff = buff;
176         while (ptr_to_buff = get_word_from_buff(ptr_to_buff, word))
177             hashmap_of_words[string(word)]++;
178     }
179     ftime(&time_end);
180     seconds = time_end.time - time_begin.time;
181     miseconds = time_end.millitm - time_begin.millitm;
182     miseconds = seconds * 1000 + miseconds;
183     printf("hash_map：\t处理时间为：\t%u\t毫秒, 统计了%d个单词\n", miseconds, hashmap_of_words.size());
184 
185     input_file.close();
186     input_file.open("D:\\input.txt");
187     hash_set<string> hashset_of_words;
188     ftime(&time_begin);
189 #if 0    // 下面代码速度奇慢无比，所以注释掉了实际没有执行，我等了半天，没有计算完，不知道是不是逻辑有问题～
190     while (input_file.getline(buff, 10240))
191     {
192         filter(buff);
193         char* ptr_to_buff = buff;
194         while (ptr_to_buff = get_word_from_buff(ptr_to_buff, word))
195             hashset_of_words.insert(string(word));
196     }
197 #endif
198     ftime(&time_end);
199     seconds = time_end.time - time_begin.time;
200     miseconds = time_end.millitm - time_begin.millitm;
201     miseconds = seconds * 1000 + miseconds;
202     printf("hash_set：\t处理时间为：\t%u\t毫秒, 统计了%d个单词\n", miseconds, hashset_of_words.size());
203 
204     system("pause");
205     return 0;
206 }

输出结果：

朴素的方法：     处理时间为：    238594  毫秒, 统计了28661个单词
hashtable：     处理时间为：    2312    毫秒, 统计了28661个单词
rbtree-set：    处理时间为：    13438   毫秒, 统计了28661个单词
hash_map：      处理时间为：    6953    毫秒, 统计了28661个单词
hash_set：      处理时间为：    0       毫秒, 统计了0个单词
请按任意键继续. . .

后来又做了小幅的改动：

 1 unsigned int hash_function_opt(const char* str)
 2 {
 3     const char* end_of_str = str+strlen(str);
 4     unsigned int sum = 0;
 5     while (end_of_str - str > 3)
 6     {
 7         sum ^= *((unsigned int*)str);
 8         str += 4;
 9     }
10     char tmp[4] = {0};
11     strcpy(tmp, str);
12     sum ^= (unsigned int)*((unsigned int*)tmp);
13     sum %= hashtable_length;
14     memset(tmp, 0, 4);
15 
16     return sum;
17 }

// Linear search for a key inside a single bucket.
//
// l:   bucket to scan.
// str: NUL-terminated key to look for; must not be null.
// Returns true if some element of the bucket compares equal to str.
bool find_in_bucket_opt(std::list<std::string>& l, const char* str)
{
    std::list<std::string>::iterator pos = l.begin();
    const std::list<std::string>::iterator last = l.end();
    while (pos != last)
    {
        if (strcmp(str, pos->c_str()) == 0)
            return true;
        ++pos;
    }
    return false;
}

// Replaces every character that is not an ASCII letter with a space, in place.
//
// str: NUL-terminated, writable buffer; must not be null.
void filter(char* str)
{
    for (char* p = str; *p != '\0'; ++p)
    {
        const bool is_lower = (*p >= 'a' && *p <= 'z');
        const bool is_upper = (*p >= 'A' && *p <= 'Z');
        if (!is_lower && !is_upper)
            *p = ' ';
    }
}

 1     ofstream output_file;
 2     output_file.open("D:\\output.txt");
 3     vector<string> all_words;
 4     for(vector<list<string>>::iterator i_v = hashtable_of_words.begin(); i_v != hashtable_of_words.end(); i_v++)
 5         for(list<string>::iterator i_l = i_v->begin(); i_l != i_v->end(); i_l++)
 6             all_words.push_back(*i_l);
 7     sort(all_words.begin(), all_words.end());
 8     for(vector<string>::iterator i_v = all_words.begin(); i_v != all_words.end(); i_v++)
 9         output_file << *i_v <<endl;
10     output_file.close();

改动之后发现，之前的版本其实有一些小错误：新版 filter 把所有非字母字符都替换为空格，而旧版只处理固定的几种标点，这应该就是单词数由 28661 变为 27735 的原因。新的输出结果为：

朴素的方法：    处理时间为：    0       毫秒, 统计了0个单词
hashtable：     处理时间为：    2079    毫秒, 统计了27735个单词
hashtable2：    处理时间为：    2047    毫秒, 统计了27735个单词
rbtree-set：    处理时间为：    0       毫秒, 统计了0个单词
hash_map：      处理时间为：    0       毫秒, 统计了0个单词
hash_set：      处理时间为：    0       毫秒, 统计了0个单词
请按任意键继续. . .

posted @ 2013-09-23 01:17 铁甲小宝阅读(372) 评论(0) 收藏举报

刷新页面返回顶部