记一个男默女泪的 BUG

姗姗来迟的词频统计代码 BUG 的发现

1. 此前提交的第一次代码作业总结博客

http://www.cnblogs.com/ustczwq/p/8680704.html
2. BUG 本天成，妙手偶得之

虽然代码已经提交，但总是感觉哪个地方不太对，bug 存在得过于莫名其妙。然后，随手打开代码，稍微调试了一下，当我发现 bug 的时候，不知道该说些什么好，只想讲脏话。
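出现 bug 的地方（原文此处为截图，未能保留；按下文“三目运算符没赋值”的描述，写法应为只做减法、没有把结果写回 temp）：

    (temp > 96) ? (temp - 96) : (temp - 64);

改过之后（把减法改成带赋值的 -=）：

    (temp > 96) ? (temp -= 96) : (temp -= 64);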
看出来了吧，妈卖批，三目运算符没赋值。改完之后，输出结果立马正确。怪不得用 unordered_map 的时候哈希表的查询出问题了，我 TM 定义的哈希函数有问题。虽然迟了，但那种优化是对的，简单补一篇，算是对原博客的完善。
3. 加了几个等于号之后的源代码

  1 #include "io.h"
  2 #include "math.h"
  3 #include "stdio.h"
  4 #include "string.h"
  5 #include "stdlib.h"
  6 #include "unordered_map"
  7 
  8 using namespace std;
  9    
 10 #define small 2                 // wordcmp() result meaning "first word sorts before the second"; getwords() advances its list scan while it sees this value
 11 
 12 int wordnum = 0;                // running total of accepted words (bumped once per word in getwords())
 13 int charnum = 0;                // running total of characters read whose code is in 32..126
 14 int linenum = 0;                // bumped once per successfully opened file, plus once per '\n' or '\r' read
 15 
 16 struct wordsdata                //存放单词信息
 17 {
 18     char words[1024];           //单词字符串
 19     int number;                 //出现次数
 20     wordsdata *next;
 21 };
 22 struct phrases
 23 {
 24     char *one;
 25     char *two;
 26     int num;
 27 };
 28 
 29 int wordcmp(char *str1, char *str2);                                  // compares two words; see the definition for the meaning of the result codes
 30 int gettop(struct wordsdata **word);                                  // writes the totals and the top words/phrases to result.txt
 31 int getwords(char *path, struct wordsdata **word);                    // reads one file and adds its words to the table `word` and its phrases to phrasemap
 32 int getfiles(char *path, struct _finddata_t *fileinfo, long handle);  // walks `path`, recursing into sub-directories and calling getwords() on files
 33 
 34 struct phrase_cmp
 35 {
 36     bool operator()(const phrases &p1, const phrases &p2) const
 37     {
 38         return ((wordcmp(p1.one, p2.one) < 2) && (wordcmp(p1.two, p2.two) < 2));
 39     }
 40 
 41 };
 42 struct phrase_hash
 43 {
 44     size_t operator()(const phrases &ph) const
 45     {
 46         unsigned long __h = 0;
 47         int temp;
 48         size_t i;
 49         for (i = 0; ph.one[i]; i++)
 50         {
 51             temp = ph.one[i];
 52             if (temp > 64)
 53             {
 54                 (temp > 96) ? (temp -= 96) : (temp -= 64);
 55                 __h += (29 * __h + temp);
 56                 __h %= 2147483647;
 57             }
 58 
 59         }
 60         for (i = 0; ph.two[i]; i++)
 61         {
 62             temp = ph.two[i];
 63             if (temp > 64)
 64             {
 65                 (temp > 96) ? (temp -= 96) : (temp -= 64);
 66                 __h += (29 * __h + temp);
 67                 __h %= 2147483647;
 68             }
 69         }
 70 
 71         return size_t(__h);
 72     }
 73 
 74 };
 75 
 76 typedef unordered_map<phrases, int, phrase_hash, phrase_cmp> Char_Phrase; // phrase -> occurrence count, hashed by phrase_hash and matched by phrase_cmp
 77 Char_Phrase phrasemap;                                                     // every phrase recorded by getwords(); ranked by gettop()
 78 struct wordsdata *fourletter[26 * 26 * 26 * 26] = {}; // word lists, one bucket per combination of a word's first four letters
 79 
 80 int main()
 81 {
 82     int j = 0;                            
 83     long handle = 0;                           // 用于查找的句柄 
 84     struct _finddata_t fileinfo;               // 文件信息的结构体 
 85     char *path = __argv[1];
 86     
 87     getfiles(path, &fileinfo, handle);
 88 
 89     gettop(fourletter);
 90 
 91     system("pause");
 92     return 1;
 93 }
 94 
 95 int getfiles(char *path, struct _finddata_t *fileinfo, long handle)
 96 {                                    
 97     handle = _findfirst(path, fileinfo);            //第一次打开父目录
 98     if (handle == -1)
 99         return -1;
100 
101 
102     do
103     {
104         //printf("> %s\n", path);           //显示目录名
105 
106         if (fileinfo->attrib & _A_SUBDIR)           //如果读取到子目录
107         {
108             if (strcmp(fileinfo->name, ".") != 0 && strcmp(fileinfo->name, "..") != 0)
109             {
110                 char temppath[1024] = "";              //记录子目录路径
111                 long temphandle = 0;
112                 struct _finddata_t tempfileinfo;
113                 strcpy(temppath, path);
114                 strcat(temppath, "/*");
115 
116                 temphandle = _findfirst(temppath, &tempfileinfo);  //第一次打开子目录
117                 if (temphandle == -1)
118                     return -1;
119 
120                 do                              //对子目录所有文件递归
121                 {
122                     if (strcmp(tempfileinfo.name, ".") != 0 && strcmp(tempfileinfo.name, "..") != 0)
123                     {
124                         strcpy(temppath, path);
125                         strcat(temppath, "/");
126                         strcat(temppath, tempfileinfo.name);
127                         getfiles(temppath, &tempfileinfo, temphandle);
128                     }
129                 } while (_findnext(temphandle, &tempfileinfo) != -1);
130 
131                 _findclose(temphandle);
132             }//递归完毕
133 
134         } //子目录读取完毕
135         else
136             getwords(path, fourletter);
137 
138 
139     } while (_findnext(handle, fileinfo) != -1);
140 
141     _findclose(handle);       //关闭句柄
142 
143     return 1;
144 
145 }
146 
147 int getwords(char *path, struct wordsdata **word)
148 {
149     FILE *fp;
150     int j = 0;
151     int cmp = 0;
152     int num = 0;               //计算首四位地址
153     char temp = 0;             //读取一个字符 ACSII 码值
154     int length = 0;
155 
156     char present[1024] = "";  //存储当前单词
157 
158     char address[4] = "";
159     struct wordsdata *q = NULL;
160     struct wordsdata *pre = NULL;
161     struct wordsdata *neword = NULL;
162     struct wordsdata *now = NULL;
163     struct wordsdata *previous = NULL;
164     struct phrases *newphrase = NULL;
165 
166     if ((fp = fopen(path, "r")) == NULL)
167     {
168         //printf("error!!! \n", path);
169         return 0;
170     }
171     linenum++;
172     while (temp != -1)
173     {
174         //读取字符串
175         temp = fgetc(fp);
176         if (temp > 31 && temp < 127)
177             charnum++;
178         if (temp == '\n' || temp == '\r')
179             linenum++;
180 
181         while ((temp >= '0' && temp <= '9') || (temp >= 'a' && temp <= 'z') || (temp >= 'A' && temp <= 'Z'))
182         {
183             if (length != -1 && length < 4)
184             {
185                 if (temp >= 'A')  //是字母
186                 {
187                     present[length] = temp;
188                     address[length] = (temp >= 'a' ? (temp - 'a') : (temp - 'A'));
189                     length++;
190                 }
191                 else            //不是字母
192                     length = -1;
193             }
194             else if (length >= 4)
195             {
196                 present[length] = temp;
197                 length++;
198             }
199             temp = fgetc(fp);
200             if (temp > 31 && temp < 127)
201                 charnum++;
202             if (temp == '\n' || temp == '\r')
203                 linenum++;
204         } // end while
205 
206           //判断是否为单词
207         if (length >= 4)
208         {
209             wordnum++;
210 
211             //计算首四位代表地址
212             num = address[0] * 17576 + address[1] * 676 + address[2] * 26 + address[3];
213 
214             //插入当前单词
215             if (word[num] == NULL)
216             {
217                 word[num] = new wordsdata;
218                 neword = new wordsdata;
219                 neword->number = 1;
220                 neword->next = NULL;
221                 strcpy(neword->words, present);
222                 word[num]->next = neword;
223                 now = neword;
224             }
225             else
226             {
227                 pre = word[num];
228                 q = pre->next;
229                 cmp = wordcmp(q->words, present);
230 
231                 while (cmp == small)
232                 {
233                     pre = q;
234                     q = q->next;
235                     if (q != NULL)
236                         cmp = wordcmp(q->words, present);
237                     else
238                         break;
239                 }
240                 if (q != NULL && cmp <= 1)
241                 {
242                     now = q;
243                     q->number++;
244                     if (cmp == 1)
245                         strcpy(q->words, present);                
246                 }
247 
248                 else
249                 {
250                     neword = new wordsdata;
251                     neword->number = 1;
252                     strcpy(neword->words, present);
253                     pre->next = neword;
254                     neword->next = q;
255                     now = neword;
256                 }
257             }
258 
259             if (previous != NULL)
260             {
261                 newphrase = new phrases;
262 
263                 newphrase->one = previous->words;
264                 newphrase->two = now->words;
265                 
266                 unordered_map<phrases, int>::const_iterator got = phrasemap.find( *newphrase);
267                 if (got != phrasemap.end())
268                 {
269                     phrasemap[*newphrase]++;
270                 }
271                 else
272                 {
273                     phrasemap.insert(pair<phrases, int>(*newphrase, 1));
274                 }
275             }
276             previous = now;
277 
278             //当前单词置空
279             for (int j = 0; present[j] && j < 1024; j++)
280                 present[j] = 0;
281         }
282         length = 0;
283     }
284 
285     fclose(fp);
286     return 1;
287 }
288 
/*
 * Compares two words, ignoring letter case and trailing digits.
 *
 * Result codes:
 *   -1, 0, 1  the words are equivalent (same text up to case and trailing
 *             digits); the value is the sign of strcmp(str1, str2), so 0
 *             means the spellings are identical
 *   2         str1 sorts before str2
 *   3         str1 sorts after str2
 *   4         fallback when none of the above applies
 *
 * strcmp() only guarantees the sign of its result, so the value is reduced
 * to -1/0/1 here; a raw result such as 2 or 8 would be indistinguishable
 * from the "different words" codes.
 */
int wordcmp(char *str1, char *str2)
{
    char *p1 = str1;
    char *p2 = str2;
    char q1;
    char q2;

    /* Walk the common prefix, comparing upper-cased characters. */
    for (;;)
    {
        q1 = *p1;
        q2 = *p2;

        if (q1 >= 'a' && q1 <= 'z')
            q1 -= 32;
        if (q2 >= 'a' && q2 <= 'z')
            q2 -= 32;

        if (!q1 || !q2 || q1 != q2)
            break;

        p1++;
        p2++;
    }

    /* Digits following the common prefix do not distinguish two words. */
    while (*p1 >= '0' && *p1 <= '9')
        p1++;
    while (*p2 >= '0' && *p2 <= '9')
        p2++;

    if (*p1 == 0 && *p2 == 0)
    {
        /* Equivalent words: report which spelling strcmp orders first. */
        int order = strcmp(str1, str2);
        return (order > 0) - (order < 0);
    }

    if (q1 < q2)
        return 2;

    if (q1 > q2)
        return 3;

    return 4;
}
333 
334 int gettop(struct wordsdata **word)
335 {
336     int i = 0, j = 0;
337     struct wordsdata *topw[12] = {};
338     struct phrases *toph[12] = {};
339     struct wordsdata *w = NULL;
340     FILE *fp;
341     fp = fopen("result.txt", "w");
342     fprintf(fp,"characters:%d \nwords:%d \nlines:%d\n",  charnum,wordnum, linenum);
343 
344     for (j = 0; j < 12; j++)
345     {        
346         toph[j] = new struct phrases;
347         toph[j]->num = 0;
348         topw[j] = new struct wordsdata;
349         topw[j]->number = 0;
350     }
351     for (i = 0; i < 456976; i++)
352     {
353         if (word[i] != NULL)
354         {
355             w = word[i]->next;
356             while (w != NULL)
357             {
358                 topw[11]->number = w->number;
359                 topw[11]->next = w;
360                 j = 11;
361                 while (j > 1 && topw[j]->number > topw[j - 1]->number)
362                 {
363                     topw[0] = topw[j];
364                     topw[j] = topw[j - 1];
365                     topw[j - 1] = topw[0];
366                     j--;
367                 }
368                 w = w->next;
369             }
370         }
371     }
372     for (j = 1; j < 11; j++)
373     {
374         if (topw[j]->number)
375             fprintf(fp,"\n%s :%d", topw[j]->next->words, topw[j]->number);
376     }
377     for (Char_Phrase::iterator it = phrasemap.begin(); it != phrasemap.end(); it++)
378     {
379         toph[11]->one = it->first.one;
380         toph[11]->two = it->first.two;
381         toph[11]->num = it->second;
382         j = 11;
383         while (j > 1 && toph[j]->num > toph[j - 1]->num)
384         {
385             toph[0] = toph[j];
386             toph[j] = toph[j - 1];
387             toph[j - 1] = toph[0];
388             j--;
389         }
390     }
391     fprintf(fp, "\n");
392     for (j = 1; j < 11; j++)
393     {
394         if (toph[j]->num)
395             fprintf(fp,"\n%s %s :%d", toph[j]->one, toph[j]->two, toph[j]->num);
396     }
397     fclose(fp);
398     return 1;
399 }
View Code
posted @ 2018-04-01 23:17 ustczwq 阅读(319) 评论(3) 收藏举报
刷新页面返回顶部
ustczwq

记一个男默女泪的 BUG

姗姗来迟的词频统计代码 BUG 的发现

1. 此前提交的第一次代码作业总结博客

2. BUG 本天成，妙手偶得之

3. 加了几个等于号之后的源代码

公告