1 #define _CRT_SECURE_NO_WARNINGS
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
5 #define MAXKEY 2000
6 #define MAX_WORD_SIZE 30
7
8 int hash(char *key) {
9 int h = 0, g;
10 while (*key) {
11 h = (h << 4) + *key++;
12 g = h & 0xf0000000;
13 if (g)
14 h ^= g >> 24;
15 h &= ~g;
16 }
17 return h % MAXKEY;
18 }
19
20 typedef struct data_count {
21 int character_count; //字符数
22 int line_count; //行数
23 int word_count; //单词数
24 }Data_c,*pData_c;
25
26 typedef struct words_info {
27 char word[MAX_WORD_SIZE]; //单词内容
28 int times; //单词次数
29 struct words_info *pNext;
30 }Word_t,*pWord_t;
31
32 //尾插输入单词
33 void tailInsert_hashtable_for_words(pWord_t *ppHead,pWord_t *ppTail,char words[]) {
34 pWord_t pNew = (pWord_t)calloc(1, sizeof(Word_t));
35 strcpy(pNew->word, words);
36 if (*ppHead == NULL) {
37 *ppHead = pNew;
38 *ppTail = pNew;
39 }
40 else {
41 (*ppTail)->pNext = pNew;
42 (*ppTail) = pNew;
43 }
44 }
45
46 //将单词放进哈希表的链表内,添加新节点或者加次数,用来解决哈希冲突
47 void putInHashTable(char word[], pWord_t pHashTable[]) {
48 //哈希表对应位置为空,则单词不存在,添加单词
49 int hashval = hash(word);
50 //哈希表内对应hash值为空,添加节点
51 if (pHashTable[hashval] == NULL) {
52 pWord_t pWordNode = (pWord_t)calloc(1, sizeof(Word_t));
53 strcpy(pWordNode->word, word);
54 pWordNode->times = 1;
55 pHashTable[hashval] = pWordNode;
56 }
57 else {
58 //遍历哈希表内对应hash值的链表,找到对应的单词
59 pWord_t pTemp = pHashTable[hashval];
60 while (pTemp->pNext != NULL) {
61 if (!(strcmp(pTemp->word, word))) {
62 pTemp->times++;
63 return;
64 }
65 pTemp = pTemp->pNext;
66 }
67 //遍历链表没有对应单词,则末尾添加新节点
68 pWord_t pWordNode = (pWord_t)calloc(1, sizeof(Word_t));
69 strcpy(pWordNode->word, word);
70 pWordNode->times = 1;
71 pTemp->pNext = pWordNode;
72 }
73 }
74
75 int compare(const void *pleft, const void *pright) {
76 Word_t *pL = (Word_t *)pleft;
77 Word_t *pR = (Word_t *)pright;
78 return pR->times - pL->times;
79 }
80
81 pWord_t createSortedWordsListFromHashTable(pWord_t pHashTable[]) {
82 //如果要使用快排,就需要将哈希表的数据转换为连续存储的动态数组。
83 int countofWordList = 1;//输出的数组的容量
84 int currentSize = 0;//当前数组的下标
85 pWord_t wordList = (pWord_t)calloc(1, sizeof(Word_t));
86 for (int i = 0; i < MAXKEY; i++) {
87 //哈希表中为空,通过
88 if (pHashTable[i] == NULL) {
89 continue;
90 }
91 else {
92 //遍历存储
93 pWord_t pCur_InHashTable = pHashTable[i];
94 while (pCur_InHashTable) {
95 currentSize++;
96 if (currentSize == countofWordList) {
97 countofWordList *= 2;
98 //数组扩容
99 wordList = (pWord_t)realloc(wordList, countofWordList *sizeof(Word_t));
100 }
101 strcpy(wordList[currentSize - 1].word, pCur_InHashTable->word);
102 wordList[currentSize - 1].times = pCur_InHashTable->times;
103 pCur_InHashTable = pCur_InHashTable->pNext;
104 }
105 }
106 }
107 qsort(wordList, currentSize, sizeof(Word_t), compare);
108 return wordList;
109 }
110
111 void calculate_words(char *buf,int count,pData_c dataCount) { //计算各种信息的个数
112 pWord_t pHashTable[MAXKEY] = { NULL };
113 int if_words = 1; //判断是否是单词的标志位
114 int start = 0, end = 0; //单词的起始索引和最后字符的索引
115 for (int i = 0; i < count; i++) {
116 //计算字符个数,排除空格、\t、换行
117 if (buf[i] != ' ' && buf[i] != '\t' &&buf[i] != '\n') {
118 dataCount->character_count++;
119 }
120 //计算行数,用'\n'\计算
121 if (buf[i] == '\n') {
122 dataCount->line_count++;
123 }
124 //计算单词个数,非字母的后面有字母,则+1
125 if (((buf[i] >= 'a'&&buf[i] <= 'z') || (buf[i] >= 'A'&&buf[i] <= 'Z'))) {
126 if (if_words == 0) {
127 start = i;
128 if_words = 1;
129 end = 0;
130 }
131 }
132 else {
133 if (if_words == 1){
134 //此时字符由之前的字母转到到非字母,判断为一个单词
135 dataCount->word_count++;
136 end = i;
137 if_words = 0;
138 }
139 else {
140 continue;
141 }
142 }
143
144 //将单词记录进hashtable
145 if ((start < end) && (if_words == 0)) {
146 char temp[30];
147 memcpy(temp, &buf[start], end - start);
148 temp[end - start] = '\0';
149 //大写转小写
150 for (int i = 0; i < end - start; i++) {
151 if (temp[i] >= 'A'&&temp[i] <= 'Z') {
152 temp[i] += 32;
153 }
154 }
155 putInHashTable(temp,&pHashTable);
156 }
157 }
158 pWord_t wordList = createSortedWordsListFromHashTable(pHashTable);
159 for (int i = 0; i < 10; i++) {
160 printf("词频第%d高的是%s,出现%d次。\n", i + 1, wordList[i].word, wordList[i].times);
161 }
162 }
163
164 int main(int args,char* argv[]) {
165 FILE *old_file = fopen(argv[1], "r");
166 if (old_file == NULL) {
167 printf("old_flie errno is %d\n", errno);
168 perror("fopen:");
169 return -1;
170 }
171
172 int count = 0; //统计原文件的字符数,申请合适的buf空间
173 while (fgetc(old_file) != EOF) {
174 count++;
175 }
176 char *buf = (char *)calloc(1,count * sizeof(char));
177
178 fseek(old_file,0,SEEK_SET); //重置前面移动的oldfile指针
179 fread(buf, 1, count, old_file); //将old_file的内容全部存进buf中
180
181 Data_c *dataCount = (Data_c *)calloc(1,sizeof(Data_c));
182 //这里统计字符个数,行数,单词个数
183 calculate_words(buf,count,dataCount);
184 fclose(old_file);
185 printf("\n\n\nOutput ended.\n");
186 printf("字符总计%d个,行数总计%d行,单词总计= %d个\n", dataCount->character_count, dataCount->line_count, dataCount->word_count);
187
188 free(buf);
189 }