C语言——txt词频统计

#define _CRT_SECURE_NO_WARNINGS
#include<errno.h>
#include<limits.h>
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
  5 #define MAXKEY 2000
  6 #define MAX_WORD_SIZE 30
  7 
  8 int hash(char *key) {
  9     int h = 0, g;
 10     while (*key) {
 11         h = (h << 4) + *key++;
 12         g = h & 0xf0000000;
 13         if (g)
 14             h ^= g >> 24;
 15         h &= ~g;
 16     }
 17     return h % MAXKEY;
 18 }
 19 
 20 typedef struct data_count {
 21     int character_count;    //字符数
 22     int line_count;    //行数
 23     int word_count;    //单词数
 24 }Data_c,*pData_c;
 25 
 26 typedef struct words_info {
 27     char word[MAX_WORD_SIZE];    //单词内容
 28     int times;    //单词次数
 29     struct words_info *pNext;
 30 }Word_t,*pWord_t;
 31 
 32 //尾插输入单词
 33 void tailInsert_hashtable_for_words(pWord_t *ppHead,pWord_t *ppTail,char words[]) {
 34     pWord_t pNew = (pWord_t)calloc(1, sizeof(Word_t));
 35     strcpy(pNew->word, words);
 36     if (*ppHead == NULL) {
 37         *ppHead = pNew;
 38         *ppTail = pNew;
 39     }
 40     else {
 41         (*ppTail)->pNext = pNew;
 42         (*ppTail) = pNew;
 43     }
 44 }
 45 
 46 //将单词放进哈希表的链表内,添加新节点或者加次数,用来解决哈希冲突
 47 void putInHashTable(char word[], pWord_t pHashTable[]) {
 48     //哈希表对应位置为空,则单词不存在,添加单词
 49     int hashval = hash(word);
 50     //哈希表内对应hash值为空,添加节点
 51     if (pHashTable[hashval] == NULL) {
 52         pWord_t    pWordNode = (pWord_t)calloc(1, sizeof(Word_t));
 53         strcpy(pWordNode->word, word);
 54         pWordNode->times = 1;
 55         pHashTable[hashval] = pWordNode;
 56     }
 57     else {
 58         //遍历哈希表内对应hash值的链表,找到对应的单词
 59         pWord_t pTemp = pHashTable[hashval];
 60         while (pTemp->pNext != NULL) {
 61             if (!(strcmp(pTemp->word, word))) {
 62                 pTemp->times++;
 63                 return;
 64             }
 65             pTemp = pTemp->pNext;
 66         }
 67         //遍历链表没有对应单词,则末尾添加新节点
 68         pWord_t    pWordNode = (pWord_t)calloc(1, sizeof(Word_t));
 69         strcpy(pWordNode->word, word);
 70         pWordNode->times = 1;
 71         pTemp->pNext = pWordNode;
 72     }
 73 }
 74 
 75 int compare(const void *pleft, const void *pright) {
 76     Word_t *pL = (Word_t *)pleft;
 77     Word_t *pR = (Word_t *)pright;
 78     return pR->times - pL->times;
 79 }
 80 
 81 pWord_t createSortedWordsListFromHashTable(pWord_t pHashTable[]) {
 82     //如果要使用快排,就需要将哈希表的数据转换为连续存储的动态数组。 
 83     int countofWordList = 1;//输出的数组的容量
 84     int currentSize = 0;//当前数组的下标
 85     pWord_t wordList = (pWord_t)calloc(1, sizeof(Word_t));
 86     for (int i = 0; i < MAXKEY; i++) {
 87         //哈希表中为空,通过
 88         if (pHashTable[i] == NULL) {
 89             continue;
 90         }
 91         else {
 92             //遍历存储
 93             pWord_t pCur_InHashTable = pHashTable[i];
 94             while (pCur_InHashTable) {
 95                 currentSize++;
 96                 if (currentSize == countofWordList) {
 97                     countofWordList *= 2;
 98                     //数组扩容
 99                     wordList = (pWord_t)realloc(wordList, countofWordList *sizeof(Word_t));
100                 }
101                 strcpy(wordList[currentSize - 1].word, pCur_InHashTable->word);
102                 wordList[currentSize - 1].times = pCur_InHashTable->times;
103                 pCur_InHashTable = pCur_InHashTable->pNext;
104             }
105         }
106     }
107     qsort(wordList, currentSize, sizeof(Word_t), compare);
108     return wordList;
109 }
110 
111 void calculate_words(char *buf,int count,pData_c dataCount) {    //计算各种信息的个数
112     pWord_t pHashTable[MAXKEY] = { NULL };
113     int if_words = 1;        //判断是否是单词的标志位
114     int start = 0, end = 0;            //单词的起始索引和最后字符的索引
115     for (int i = 0; i < count; i++) {
116         //计算字符个数,排除空格、\t、换行
117         if (buf[i] != ' ' && buf[i] != '\t' &&buf[i] != '\n') {                
118             dataCount->character_count++;
119         }
120         //计算行数,用'\n'\计算
121         if (buf[i] == '\n') {                                            
122             dataCount->line_count++;
123         }
124         //计算单词个数,非字母的后面有字母,则+1
125         if (((buf[i] >= 'a'&&buf[i] <= 'z') || (buf[i] >= 'A'&&buf[i] <= 'Z'))) {
126             if (if_words == 0) {
127                 start = i;
128                 if_words = 1;
129                 end = 0;
130             }
131         }
132         else {
133             if (if_words == 1){ 
134                 //此时字符由之前的字母转到到非字母,判断为一个单词
135                 dataCount->word_count++; 
136                 end = i;
137                 if_words = 0;
138             }
139             else {
140                 continue;
141             }
142         }
143 
144         //将单词记录进hashtable
145         if ((start < end) && (if_words == 0)) {
146             char temp[30];
147             memcpy(temp, &buf[start], end - start);
148             temp[end - start] = '\0';
149             //大写转小写
150             for (int i = 0; i < end - start; i++) {
151                 if (temp[i] >= 'A'&&temp[i] <= 'Z') {
152                     temp[i] += 32;
153                 }
154             }
155             putInHashTable(temp,&pHashTable);
156         }
157     }
158     pWord_t wordList = createSortedWordsListFromHashTable(pHashTable);
159     for (int i = 0; i < 10; i++) {
160         printf("词频第%d高的是%s,出现%d次。\n", i + 1, wordList[i].word, wordList[i].times);
161     }
162 }
163 
164 int main(int args,char* argv[]) {
165     FILE *old_file = fopen(argv[1], "r");
166     if (old_file == NULL) {
167         printf("old_flie errno is %d\n", errno);
168         perror("fopen:");
169         return -1;
170     }
171 
172     int count = 0;                                                                        //统计原文件的字符数,申请合适的buf空间            
173     while (fgetc(old_file) != EOF) {
174         count++;
175     }
176     char *buf = (char *)calloc(1,count * sizeof(char));
177 
178     fseek(old_file,0,SEEK_SET);                                                            //重置前面移动的oldfile指针
179     fread(buf, 1, count, old_file);                                                        //将old_file的内容全部存进buf中
180 
181     Data_c *dataCount = (Data_c *)calloc(1,sizeof(Data_c));
182     //这里统计字符个数,行数,单词个数
183     calculate_words(buf,count,dataCount);
184     fclose(old_file);
185     printf("\n\n\nOutput ended.\n");
186     printf("字符总计%d个,行数总计%d行,单词总计= %d个\n", dataCount->character_count, dataCount->line_count, dataCount->word_count);
187 
188     free(buf);
189 }

 

posted on 2021-02-09 18:02  平ping  阅读(304)  评论(0)    收藏  举报