1 /*
  2  * 该程序用于计算语言的核心项集
  3  * RexfieldVon
  4  * 2013年8月24日21:19:25
  5  */
  6 #include <stdio.h>
  7 #include <stdlib.h>
  8 #include <string.h>
  9 #include <assert.h>
 10 
 11 #ifndef bool
 12 #    define bool char
 13 #endif
 14 
 15 #ifndef true
 16 #    define true 1
 17 #endif
 18 
 19 #ifndef false
 20 #    define false 0
 21 #endif
 22 
 23 #define NEXTSIZE 256
 24 
 25 struct TrieTreeNode
 26 {
 27     struct TrieTreeNode *Next[NEXTSIZE];
 28     bool Accepted;
 29 };
 30 
 31 struct TrieTreeRoot
 32 {
 33     int NodeCount;
 34     struct TrieTreeNode *Tree;
 35 };
 36 
 37 struct Collection
 38 {
 39     char *Expression;                // 产生式
 40     struct Collection *next;
 41 };
 42 struct CoreCollection
 43 {
 44     struct Collection *S;            // 项集
 45     int id;                            // 项集序号
 46     bool marked;                    // 是否被处理
 47     unsigned char *FeatureString;    // 特征字串
 48     int FeatureStringLength;        // 特征字串长度
 49     unsigned int FeatureHash;        // 特征哈希
 50     struct CoreCollection *next;
 51 };
 52 struct Record
 53 {
 54     int RecordRow;                // 当前最大项位
 55     int RecordRowMax;            // 最大分配项数
 56     int **Record;                // 记录指针
 57 };
 58 /* 三级指针
 59  * 第一级指向整个产生式组
 60  * 第二级指向单个产生式
 61  * 第三级指向产生式符号单元
 62  * 约定:①所有的大写字母为非终结符②所有小写字母为终结符③'\377'为eof④'\0'为ε⑤'\376'为占位符·
 63  */
 64 char*** GrammerRule;
 65 /*
 66  * 文法书写约定:
 67  * 每个字符串表示一个单独的产生式
 68  * 第一个字符为产生式左边的非终结符,由初始化引擎进行产生式归并
 69  * 整个文法以 null 结束
 70  */
 71 char *Grammer[] =
 72 {
 73     "GL",
 74     "LLP", "LP",
 75     "P(P)", "P()",
 76     "\0"
 77 };
 78 
 79 /*
 80  * 构建 Trie 树并初始化
 81  * 返回一个新的 Trie 根节点
 82  */
 83 struct TrieTreeRoot *BuildTrieTree()
 84 {
 85     struct TrieTreeRoot *Root = (struct TrieTreeRoot *)malloc(sizeof(struct TrieTreeRoot));
 86     Root->NodeCount = 1;
 87     Root->Tree = (struct TrieTreeNode *)malloc(sizeof(struct TrieTreeNode));
 88     memset(Root->Tree, '\0', sizeof(struct TrieTreeNode));
 89     return Root;
 90 }
 91 
 92 /*
 93  * 插入新的字符串
 94  * Root : struct TrieTreeRoot* 要操作的 Trie 树根节点
 95  * Item : char* 要插入的字符串
 96  */
 97 void InsertItem(struct TrieTreeRoot *Root, char *Item)
 98 {
 99     struct TrieTreeNode *Ptr = Root->Tree;
100     int index = 0;
101     unsigned char Charactor;
102 
103     while ((Charactor = Item[index]) != '\0')
104     {
105         if (Ptr->Next[Charactor] == NULL)
106         {
107             Ptr->Next[Charactor] = (struct TrieTreeNode *)malloc(sizeof(struct TrieTreeNode));
108             memset(Ptr->Next[Charactor], '\0', sizeof(struct TrieTreeNode));
109             Root->NodeCount++;
110         }
111         Ptr = Ptr->Next[Charactor];
112         index++;
113     }
114 
115     Ptr->Accepted = true;
116 }
117 
118 /*
119  * 递归序列化 Trie 树
120  * Node : struct TrieTreeNode* 当前操作的 Trie 节点
121  * WritePtr : unsigned char* 特征串写入指针
122  */
123 unsigned char *DoFeature(struct TrieTreeNode *Node, unsigned char *WritePtr)
124 {
125     int i, count = 0;
126     unsigned char *ErgodicPtr;
127 
128     *WritePtr = (unsigned char)Node->Accepted;    // 写入节点是否接受
129     WritePtr++;
130 
131     ErgodicPtr = WritePtr;                // 记录集合起始地址
132 
133     for (i = 0; i < NEXTSIZE; i++)        // 将该组记录写入特征串
134     {
135         if (Node->Next[i] != NULL)
136         {
137             *WritePtr = (char)i;
138             WritePtr++;
139             count++;
140         }
141     }
142 
143     *WritePtr = '\0';                    // 写入组分隔符
144     WritePtr++;
145 
146     for (i = 0; i < count; i++)            // 递归调用处理所有边
147     {
148         WritePtr = DoFeature(Node->Next[ErgodicPtr[i]], WritePtr);
149     }
150 
151     return WritePtr;
152 }
153 
154 /*
155  * 取得 Trie 的特征串,即序列化 Trie 树
156  * Root : struct TrieTreeRoot* 要操作的 Trie 树根节点
157  * StringLength : int* 长度指针(为了返回二进制串而设置)
158  */
159 unsigned char *GetFeatureString(struct TrieTreeRoot *Root, int *StringLength)
160 {
161     struct TrieTreeNode *Ptr = Root->Tree;
162     // 假设最坏情况下,每个节点只有一条边,那么存储该节点就需要三个单元(Accepted、边、分隔符)
163     // 但实际上真正用到的只有 3N-1 个字节
164     unsigned char *FeatureString = (unsigned char *)malloc(Root->NodeCount * 3);
165     unsigned char *WritePtr = FeatureString;
166 
167     WritePtr = DoFeature(Ptr, WritePtr);
168 
169     *StringLength = WritePtr - FeatureString;
170     return FeatureString;
171 }
172 
/*
 * Build the global GrammerRule table from the Grammer string list.
 * For every left-hand symbol a NULL-terminated array of right-hand sides is
 * created; each entry points into the matching Grammer string just past its
 * first character, in the order the productions appear in Grammer.
 * The table has one slot per unsigned char value, so any leading byte is a
 * valid index. On allocation failure a message is printed and the program
 * exits.
 */
void InitizationGrammerRule()
{
    enum { TABLE_SIZE = 256 };                      // one slot per possible unsigned char
    int ProductionCount[TABLE_SIZE] = {0};
    int index;
    unsigned char Unterminal;

    // Allocate the table header; elements are pointers, not ints.
    GrammerRule = (char ***)calloc(TABLE_SIZE, sizeof *GrammerRule);
    if (GrammerRule == NULL)
    {
        fprintf(stderr, "InitizationGrammerRule: out of memory\n");
        exit(EXIT_FAILURE);
    }

    // First pass: count the productions of every left-hand symbol.
    for (index = 0; (Unterminal = (unsigned char)Grammer[index][0]) != '\0'; index++)
    {
        ProductionCount[Unterminal]++;
    }

    // Second pass: store the right-hand sides.
    for (index = 0; (Unterminal = (unsigned char)Grammer[index][0]) != '\0'; index++)
    {
        char **Rules = GrammerRule[Unterminal];
        int blank = 0;

        if (Rules == NULL)
        {
            // one extra slot keeps the list NULL-terminated
            Rules = (char **)calloc((size_t)ProductionCount[Unterminal] + 1, sizeof *Rules);
            if (Rules == NULL)
            {
                fprintf(stderr, "InitizationGrammerRule: out of memory\n");
                exit(EXIT_FAILURE);
            }
            GrammerRule[Unterminal] = Rules;
        }

        // find the first free slot
        while (Rules[blank] != NULL)
        {
            blank++;
        }
        Rules[blank] = &Grammer[index][1];
    }
}
203 
/*
 * Count the terminal symbols of the grammar.
 * Walks the first 128 slots of GrammerRule and counts every right-hand-side
 * character outside 'A'..'Z'; a symbol that occurs several times is counted
 * each time.
 * return the number of terminal occurrences
 */
int GetTerminalCount()
{
    int Total = 0;
    int Head;

    for (Head = 0; Head < 128; Head++)
    {
        char **Rule = GrammerRule[Head];

        if (Rule == NULL)
        {
            continue;
        }
        for (; *Rule != NULL; Rule++)
        {
            char *Sym;

            for (Sym = *Rule; *Sym != '\0'; Sym++)
            {
                if (!(*Sym >= 'A' && *Sym <= 'Z'))
                {
                    Total++;
                }
            }
        }
    }
    return Total;
}
234 
/*
 * Worker for GetFIRST.
 * Visited has one flag per letter 'A'..'Z'; a non-terminal is expanded at
 * most once, which ends the recursion on left-recursive grammars.
 */
static void CollectFIRST(unsigned char Token, char *FIRST, int *Ptr, unsigned char *Visited)
{
    int i;

    if (Token >= 'A' && Token <= 'Z')
    {
        char **Rules = GrammerRule[Token];

        // a non-terminal without productions contributes nothing
        if (Rules == NULL || Visited[Token - 'A'])
        {
            return;
        }
        Visited[Token - 'A'] = 1;
        for (i = 0; Rules[i] != NULL; i++)
        {
            CollectFIRST((unsigned char)Rules[i][0], FIRST, Ptr, Visited);
        }
        return;
    }

    // Terminal (or '\0' from an empty production): store it once only, so
    // the output never grows beyond the number of distinct symbols.
    for (i = 0; i < *Ptr; i++)
    {
        if ((unsigned char)FIRST[i] == Token)
        {
            return;
        }
    }
    FIRST[*Ptr] = (char)Token;
    *Ptr = *Ptr + 1;
}

/*
 * Collect the FIRST set of a symbol.
 * Token : unsigned char symbol whose FIRST set is wanted
 * FIRST : char* output array that receives the symbols
 * Ptr : int* number of entries already in FIRST; advanced for every symbol
 *       appended
 * A symbol outside 'A'..'Z' is its own FIRST set. For a non-terminal the
 * leading symbol of each of its productions is followed recursively.
 * Symbols already present in FIRST[0 .. *Ptr) are not appended again;
 * first-occurrence order is kept.
 */
void GetFIRST(unsigned char Token, char *FIRST, int *Ptr)
{
    unsigned char Visited[26] = {0};

    CollectFIRST(Token, FIRST, Ptr, Visited);
}
257 
/*
 * Print one LR(1) item as "[A -> symbols, lookahead]".
 * Item : struct Collection* the item to print
 * The first character is the left-hand side, the last character is the
 * lookahead; '\377' is shown as <eof> and '\376' (the dot) as <@>.
 */
void PrintItem(struct Collection *Item)
{
    const char *Text = Item->Expression;
    const char *Sym = Text + 1;

    printf("[%c ->", Text[0]);

    // everything except the final character belongs to the right-hand side
    while (Sym[1] != '\0')
    {
        printf(" ");
        if (*Sym == '\377')
        {
            printf("<eof>");
        }
        else if (*Sym == '\376')
        {
            printf("<@>");
        }
        else
        {
            printf("%c", *Sym);
        }
        Sym++;
    }

    // the remaining character is the lookahead
    if (*Sym != '\377')
    {
        printf(", %c]", *Sym);
    }
    else
    {
        printf(", <eof>]");
    }
}
291 
/*
 * Print an item set, one item per line, between two ruler lines.
 * S : struct Collection* first item of the set to print
 */
void PrintCollections(struct Collection *S)
{
    struct Collection *Item = S;

    printf("-------- Collection ---------\n");
    while (Item != NULL)
    {
        PrintItem(Item);
        printf("\n");
        Item = Item->next;
    }
    printf("-----------------------------\n");
}
306 
/*
 * Append an item to an item set unless an equal item is already present.
 * S : struct Collection* head of the item set
 * Tail : struct Collection** cached tail pointer; may be NULL, and *Tail
 *        may be NULL, in which case the tail is found by walking from S.
 *        When given, *Tail is left pointing at the last node of the list.
 * Expression : char* item string; it is copied, so the caller keeps
 *              ownership of its own buffer
 * Nothing happens when S or Expression is NULL. On allocation failure a
 * message is printed and the program exits.
 */
void AddItem(struct Collection *S, struct Collection **Tail, char *Expression)
{
    struct Collection *LocalTail = NULL;
    struct Collection *SPtr;
    struct Collection *NewItem;
    size_t Size;

    if (S == NULL || Expression == NULL)
    {
        return;
    }

    // A missing tail cache is replaced by a local one instead of a heap cell.
    if (Tail == NULL) {Tail = &LocalTail;}
    if ((*Tail) == NULL) {(*Tail) = S;}
    while ((*Tail)->next != NULL) {(*Tail) = (*Tail)->next;}

    // Reject duplicates; nodes without an expression (dummy heads) never match.
    for (SPtr = S; SPtr != NULL; SPtr = SPtr->next)
    {
        if (SPtr->Expression != NULL &&
            strcmp(SPtr->Expression, Expression) == 0)
        {
            return;
        }
    }

    Size = strlen(Expression) + 1;
    NewItem = (struct Collection *)malloc(sizeof *NewItem);
    if (NewItem != NULL)
    {
        NewItem->Expression = (char *)malloc(Size);
    }
    if (NewItem == NULL || NewItem->Expression == NULL)
    {
        free(NewItem);
        fprintf(stderr, "AddItem: out of memory\n");
        exit(EXIT_FAILURE);
    }
    memcpy(NewItem->Expression, Expression, Size);
    NewItem->next = NULL;

    (*Tail)->next = NewItem;
    (*Tail) = NewItem;
}
337 
/*
 * Closure operation: extend an item set in place.
 * S : struct Collection* item set to close (NULL is accepted and ignored)
 * TerminalCount : int number of terminals; the FIRST buffer holds
 *                 TerminalCount + 1 entries
 * For every item [A -> b . C z, a] where C is a non-terminal, the items
 * [C -> . g, t] are added for each production C -> g and each t returned by
 * GetFIRST for the single symbol that follows C (the lookahead when nothing
 * else follows). Symbols that derive epsilon are not looked through.
 * Items appended during the walk are reached by the same walk, so one pass
 * over the list reaches the fixed point.
 */
void Closure(struct Collection *S, int TerminalCount)
{
    struct Collection *Ptr;
    struct Collection *Tail = S;
    char *FIRST;

    if (S == NULL)
    {
        return;
    }

    // One FIRST buffer is reused for every item and released at the end.
    FIRST = (char *)malloc((size_t)TerminalCount + 1);
    if (FIRST == NULL)
    {
        fprintf(stderr, "Closure: out of memory\n");
        exit(EXIT_FAILURE);
    }

    for (Ptr = S; Ptr != NULL; Ptr = Ptr->next)        // for each item [A->b.Cz,a] in S
    {
        char *Placeholder;
        unsigned char Unterminal;
        int ProductionIndex, FIRSTCount = 0;

        if (Ptr->Expression == NULL)
        {
            continue;
        }
        Placeholder = strchr(Ptr->Expression, '\376');
        // The dot must be followed by a symbol and by something after that
        // symbol; test the nearer byte first so nothing past the terminator
        // is read.
        if (Placeholder == NULL ||
            Placeholder[1] == '\0' ||
            Placeholder[2] == '\0')
        {
            continue;
        }

        Unterminal = (unsigned char)Placeholder[1];
        if (Unterminal < 'A' || Unterminal > 'Z' || GrammerRule[Unterminal] == NULL)
        {
            continue;
        }

        // FIRST depends only on the symbol after C, not on the production.
        memset(FIRST, '\0', (size_t)TerminalCount + 1);
        GetFIRST((unsigned char)Placeholder[2], FIRST, &FIRSTCount);

        for (ProductionIndex = 0; GrammerRule[Unterminal][ProductionIndex] != NULL; ProductionIndex++)    // for each production C->g
        {
            char *GRExpr = GrammerRule[Unterminal][ProductionIndex];
            size_t GRExprLength = strlen(GRExpr);
            // layout: C, dot, g, lookahead, terminator
            char *Expr = (char *)malloc(GRExprLength + 4);
            int i;

            if (Expr == NULL)
            {
                fprintf(stderr, "Closure: out of memory\n");
                exit(EXIT_FAILURE);
            }
            Expr[0] = (char)Unterminal;
            Expr[1] = '\376';
            memcpy(Expr + 2, GRExpr, GRExprLength);
            Expr[GRExprLength + 3] = '\0';

            for (i = 0; i < FIRSTCount; i++)        // for each b in FIRST
            {
                if (FIRST[i] != '\0')               // S <- S + {[C->.g,b]}
                {
                    Expr[GRExprLength + 2] = FIRST[i];
                    AddItem(S, &Tail, Expr);        // AddItem stores its own copy
                }
            }
            free(Expr);
        }
    }

    free(FIRST);
}
391 
/*
 * Goto operation.
 * S : struct Collection* source item set
 * Symbol : char grammar symbol to move the dot over
 * TerminalCount : int number of terminals (passed on to Closure)
 * Every item [a -> b . x z, l] with x == Symbol contributes
 * [a -> b x . z, l]; the closure of the collected items is returned as a
 * newly allocated list, or NULL when no item qualifies.
 * An item whose dot sits directly in front of the lookahead is complete and
 * is never moved, even when its lookahead equals Symbol.
 */
struct Collection *Goto(struct Collection *S, char Symbol, int TerminalCount)
{
    // moved <- empty set; a dummy head on the stack keeps AddItem uniform
    struct Collection Head = {NULL, NULL};
    struct Collection *Tail = &Head;
    struct Collection *Moved;

    for (; S != NULL; S = S->next)    // for each item i in S
    {
        char *Placeholder;
        char *Expr;
        size_t Size, DotIndex;

        if (Symbol == '\0' || S->Expression == NULL)
        {
            continue;
        }
        Placeholder = strchr(S->Expression, '\376');
        if (Placeholder == NULL ||
            Placeholder[1] != Symbol ||
            Placeholder[2] == '\0')    // Placeholder[1] would be the lookahead
        {
            continue;
        }

        // Work on a private copy so the source item stays untouched.
        Size = strlen(S->Expression) + 1;
        DotIndex = (size_t)(Placeholder - S->Expression);
        Expr = (char *)malloc(Size);
        if (Expr == NULL)
        {
            fprintf(stderr, "Goto: out of memory\n");
            exit(EXIT_FAILURE);
        }
        memcpy(Expr, S->Expression, Size);
        Expr[DotIndex] = Symbol;
        Expr[DotIndex + 1] = '\376';
        AddItem(&Head, &Tail, Expr);    // moved <- moved + {[a->bx.z,l]}
        free(Expr);                     // AddItem keeps its own copy
    }

    Moved = Head.next;
    Closure(Moved, TerminalCount);    // return closure(moved)
    return Moved;
}
423 
/*
 * ELF hash over a byte buffer.
 * str : unsigned char* the bytes to hash (may contain zero bytes)
 * length : int number of bytes to hash
 * Returns the hash with the top bit cleared.
 */
unsigned int ELFHash_Bin(unsigned char *str, int length)
{
    unsigned int acc = 0;
    int pos;

    for (pos = 0; pos < length; pos++)
    {
        unsigned int top;

        acc = (acc << 4) + str[pos];
        top = acc & 0xF0000000L;
        if (top == 0)
        {
            continue;
        }
        // fold the top nibble back into the low bits, then clear it
        acc ^= top >> 24;
        acc &= ~top;
    }
    return acc & 0x7FFFFFFF;
}
444 
/*
 * Release a trie node together with everything below it.
 */
static void FreeTrieNode(struct TrieTreeNode *Node)
{
    int i;

    if (Node == NULL)
    {
        return;
    }
    for (i = 0; i < NEXTSIZE; i++)
    {
        FreeTrieNode(Node->Next[i]);
    }
    free(Node);
}

/*
 * Compute the feature values of an item set.
 * CC : struct CoreCollection* item set whose FeatureString,
 *      FeatureStringLength and FeatureHash are filled in
 * All items are inserted into a temporary trie, the trie is serialized into
 * a newly allocated feature string and that string is hashed. The trie is
 * released before returning.
 * A previous FeatureString is not freed here because callers may pass a
 * structure whose fields have not been initialized yet.
 */
void CompleteFeature(struct CoreCollection *CC)
{
    struct TrieTreeRoot *TrieRoot = BuildTrieTree();
    struct Collection *SPtr;

    for (SPtr = CC->S; SPtr != NULL; SPtr = SPtr->next)
    {
        if (SPtr->Expression != NULL)
        {
            InsertItem(TrieRoot, SPtr->Expression);
        }
    }
    CC->FeatureString = GetFeatureString(TrieRoot, &CC->FeatureStringLength);
    CC->FeatureHash = ELFHash_Bin(CC->FeatureString, CC->FeatureStringLength);

    // the trie was only needed to obtain a canonical serialization
    FreeTrieNode(TrieRoot->Tree);
    free(TrieRoot);
}
460 
/*
 * Look an item set up in the canonical collection.
 * CC : struct CoreCollection* first element of the canonical collection
 * S : struct CoreCollection* item set to look for; its feature values are
 *     (re)computed first
 * Returns the id of the first element whose hash, length and feature string
 * all equal those of S, or -1 when there is none.
 */
int CollectionExist(struct CoreCollection *CC, struct CoreCollection *S)
{
    struct CoreCollection *Known;

    // feature values of the candidate
    CompleteFeature(S);

    for (Known = CC; Known != NULL; Known = Known->next)
    {
        // compute the features of an element that does not have them yet
        if (!(Known->FeatureString != NULL &&
              Known->FeatureHash != 0 &&
              Known->FeatureStringLength != 0))
        {
            CompleteFeature(Known);
        }

        // cheapest comparisons first
        if (Known->FeatureHash != S->FeatureHash)
        {
            continue;
        }
        if (Known->FeatureStringLength != S->FeatureStringLength)
        {
            continue;
        }
        if (memcmp(Known->FeatureString, S->FeatureString, S->FeatureStringLength) != 0)
        {
            continue;
        }
        return Known->id;
    }
    return -1;
}
489 
/*
 * Add an item set to the canonical collection unless an equal set exists.
 * CC : struct CoreCollection* first element of the canonical collection
 * Tail : struct CoreCollection** cached tail pointer; may be NULL, and
 *        *Tail may be NULL, in which case the tail is found by walking
 *        from CC
 * S : struct Collection* item set to add; on success the new element refers
 *     to it, otherwise it stays with the caller
 * CCid : int highest id handed out so far; a new element gets CCid + 1
 * Returns the id of the existing equal set, or CCid + 1 when S was
 * appended, or -1 when CC is NULL. On allocation failure a message is
 * printed and the program exits.
 */
int AddCoreCollection(struct CoreCollection *CC, struct CoreCollection **Tail, struct Collection *S, int CCid)
{
    struct CoreCollection *LocalTail = NULL;
    struct CoreCollection *CCItem;
    int id;

    if (CC == NULL)
    {
        return -1;
    }

    // A missing tail cache is replaced by a local one instead of a heap cell.
    if (Tail == NULL) {Tail = &LocalTail;}
    if ((*Tail) == NULL) {(*Tail) = CC;}
    while ((*Tail)->next != NULL) {(*Tail) = (*Tail)->next;}

    // zero-filled, so the feature fields start out empty
    CCItem = (struct CoreCollection *)calloc(1, sizeof *CCItem);
    if (CCItem == NULL)
    {
        fprintf(stderr, "AddCoreCollection: out of memory\n");
        exit(EXIT_FAILURE);
    }
    CCItem->id = CCid + 1;
    CCItem->marked = false;
    CCItem->S = S;
    CCItem->next = NULL;

    id = CollectionExist(CC, CCItem);
    if (id == -1)        // if temp is not in CC
    {
        id = CCItem->id;
        (*Tail)->next = CCItem;                        // CC <- CC + {temp}
        (*Tail) = CCItem;
    }
    else
    {
        // The candidate wrapper and the feature string computed for it are
        // no longer needed; S itself is left to the caller.
        free(CCItem->FeatureString);
        free(CCItem);
    }
    return id;
}
518 
/*
 * Record the transition Goto[CCi, Symbol] -> CCj.
 * RecordTable : struct Record* transition table
 * CCi : int id of the source item set (row index)
 * Symbol : unsigned char transition symbol (column index)
 * CCj : int id of the destination item set
 * The row table grows in steps of 32 rows and every row up to CCi is
 * allocated zero-filled. A cell value of 0 means "empty", so a transition
 * to set 0 cannot be told apart from an empty cell.
 * A new transition is stored and printed; a different value already in the
 * cell is reported as a conflict; an identical value is ignored.
 * Calls with a NULL table or a negative CCi are ignored. On allocation
 * failure a message is printed and the program exits.
 */
void Record(struct Record *RecordTable, int CCi, unsigned char Symbol, int CCj)
{
    if (RecordTable == NULL || CCi < 0)
    {
        return;
    }

    // [CCi, Symbol] -> CCj
    if (RecordTable->RecordRow < CCi)        // row beyond the highest one in use
    {
        int Row;

        if (RecordTable->RecordRowMax <= CCi)    // beyond the allocated rows: grow the table
        {
            int NewMax = (CCi / 32 + 1) * 32;
            // realloc takes a byte count, so scale by the element size;
            // keep the old pointer until the call has succeeded
            int **Grown = (int **)realloc(RecordTable->Record, (size_t)NewMax * sizeof *Grown);

            if (Grown == NULL)
            {
                fprintf(stderr, "Record: out of memory\n");
                exit(EXIT_FAILURE);
            }
            RecordTable->Record = Grown;
            RecordTable->RecordRowMax = NewMax;
        }

        // allocate every row that has not been created yet, not only CCi
        Row = (RecordTable->RecordRow < 0) ? 0 : RecordTable->RecordRow + 1;
        for (; Row <= CCi; Row++)
        {
            int *tmp_spc = (int *)calloc(256, sizeof *tmp_spc);

            if (tmp_spc == NULL)
            {
                fprintf(stderr, "Record: out of memory\n");
                exit(EXIT_FAILURE);
            }
            RecordTable->Record[Row] = tmp_spc;
        }
        RecordTable->RecordRow = CCi;
    }

    if (RecordTable->Record[CCi][Symbol] == CCj)
    {
        // same transition already recorded
    }
    else if (RecordTable->Record[CCi][Symbol] != 0)
    {
        printf("Find Conflict.\n");
    }
    else
    {
        RecordTable->Record[CCi][Symbol] = CCj;
        printf("[CC%d, %c] -> CC%d\n", CCi, Symbol, CCj);
    }
}
557 
/*
 * Release an item list together with its item strings.
 */
static void FreeItemList(struct Collection *Item)
{
    while (Item != NULL)
    {
        struct Collection *Following = Item->next;

        free(Item->Expression);
        free(Item);
        Item = Following;
    }
}

/*
 * Compute the canonical collection of item sets and the Goto table.
 * Starts from the closure of the start item, then for every unprocessed set
 * and every symbol that follows a dot computes the goto set, adds it to the
 * collection when it is new (printing it) and records the transition.
 * Goto sets that turn out to exist already are released again. The
 * collection and the table are not freed before returning.
 * On allocation failure a message is printed and the program exits.
 */
void LRCollection()
{
    static const char StartItem[] = "G\376L\377";    // start item: G, dot, L, eof lookahead
    int TerminalCount = GetTerminalCount(), CCid = 0;
    struct Record *RecordTable = (struct Record *)calloc(1, sizeof *RecordTable);
    struct Collection *S = (struct Collection *)calloc(1, sizeof *S);
    struct CoreCollection *CC = (struct CoreCollection *)calloc(1, sizeof *CC), *CCPtr, *CCTail;

    if (RecordTable == NULL || S == NULL || CC == NULL)
    {
        fprintf(stderr, "LRCollection: out of memory\n");
        exit(EXIT_FAILURE);
    }

    RecordTable->RecordRow = -1;
    RecordTable->RecordRowMax = 32;
    // 32 row pointers: the element type is int *, not int
    RecordTable->Record = (int **)calloc(32, sizeof *RecordTable->Record);
    S->Expression = (char *)malloc(sizeof StartItem);
    if (RecordTable->Record == NULL || S->Expression == NULL)
    {
        fprintf(stderr, "LRCollection: out of memory\n");
        exit(EXIT_FAILURE);
    }
    memcpy(S->Expression, StartItem, sizeof StartItem);
    S->next = NULL;
    Closure(S, TerminalCount);        // CC0 <- closure({[S' -> . S, eof]})

    CC->id = 0;
    CC->marked = false;
    CC->S = S;                        // CC <- {CC0}
    CC->next = NULL;
    CompleteFeature(CC);
    CCTail = CC;

    for (CCPtr = CC; CCPtr != NULL; CCPtr = CCPtr->next)        // while new sets are still being added to CC
    {
        struct Collection *ExprPtr;

        if (CCPtr->marked != false)        // only unmarked sets are processed
        {
            continue;
        }
        CCPtr->marked = true;              // mark CCi as processed

        for (ExprPtr = CCPtr->S; ExprPtr != NULL; ExprPtr = ExprPtr->next)        // for each x following a dot in CCi
        {
            char *Placeholder = strchr(ExprPtr->Expression, '\376');
            unsigned char PrevSym;
            struct Collection *temp;
            int temp_id;

            // skip items whose dot is directly in front of the lookahead
            if (Placeholder == NULL || *(Placeholder + 1) == '\0' || *(Placeholder + 2) == '\0')
            {
                continue;
            }

            PrevSym = *(Placeholder + 1);
            temp = Goto(CCPtr->S, PrevSym, TerminalCount);    // temp <- goto(CCi, x)
            temp_id = AddCoreCollection(CC, &CCTail, temp, CCid);
            if (temp_id > CCid)
            {
                printf("Goto(CC%d, %c):\n", CCPtr->id, PrevSym);
                PrintCollections(temp);
                CCid++;        // a new id has been handed out
            }
            else
            {
                // the set already exists, so this copy is not referenced by the collection
                FreeItemList(temp);
            }
            // record transition from CCi to temp on x
            Record(RecordTable, CCPtr->id, PrevSym, temp_id);
            printf("\n");
        }
    }
}
613 
/*
 * Program entry: build the grammar table, then compute and print the item
 * sets and transitions. The command-line arguments are not used.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    InitizationGrammerRule();    // set up the grammar table
    LRCollection();

    return 0;
}

 

posted on 2013-08-24 21:39  RexfieldVon  阅读(583)  评论(0编辑  收藏  举报