[Compiling Principles] LEX基本功能的实现

学习编译原理的时候做的一个小型LEX,即词法分析产生器,可以根据输入的正则表达式生成自动机,从中识别出代码中的token串,存入一个二维表,也就是SymbolTable。

主体代码如下:

代码
#include "stdio.h"

#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <queue>
#include <set>
#include <stack>
#include <string>

using namespace std;
// One NFA fragment: a path leading from state `from` to state `to`.
// Operator markers pushed on the parsing stacks reuse this struct with
// negative values in `from`/`to`.
struct path
{
    int from;               // state the path starts in
    int to;                 // state the path ends in
    std::string strPath;    // textual form of the expression this path covers
};
// Result of turning one regular expression into an NFA fragment.
struct dealStruct
{
    int stCount;            // highest state number in use after the conversion
    struct path* resPtr;    // path describing the whole expression (entry/exit states)
};
// Singly linked list node holding one DFA state, i.e. a set of NFA states.
struct setNode
{
    std::set<int> *Dset;     // NFA states that make up this DFA state
    int num;                 // number assigned to this DFA state
    struct setNode * next;   // next DFA state in creation order, or NULL
};
// Returns true when `v` is an operand character of the regular-expression
// dialect: an ASCII digit, an ASCII letter, or one of  > < = ` + -
// Everything else (including the operators ( ) * | & and the end markers
// # %) yields false.
bool judgeLD( char v )
{
    const bool isDigit = ( v >= '0' && v <= '9' );
    const bool isUpper = ( v >= 'A' && v <= 'Z' );
    const bool isLower = ( v >= 'a' && v <= 'z' );
    const bool isExtra = ( v == '>' || v == '<' || v == '=' ||
                           v == '`' || v == '+' || v == '-' );
    return isDigit || isUpper || isLower || isExtra;
}
// ---- Forward declarations -------------------------------------------------
struct dealStruct * dealWithStr( std::string, int );   // regular expression -> NFA fragment
int jihe( int, std::stack<struct path*> & );           // applies the '*' operator
int lianjie( int, std::stack<struct path*> & );        // applies '&' (explicit concatenation: ab == a&b)
int huo( int, std::stack<struct path*> & );            // applies the '|' operator
int reduceNFA( int, int*, int );                       // NFA -> DFA
void BFS( int, int );                                  // epsilon closure of one state
void produceC( int, std::string * );                   // writes the scanner .cpp file
void reduceDFA();                                      // DFA simplification
void combine(int,int);                                 // merges two DFA states during simplification

// ---- Shared state ---------------------------------------------------------
int arrForBFS[100] = {0};              // closure produced by BFS(), terminated by -1
// NOTE: the two tables below are 2000 x 2000 std::string objects each, which
// is a very large static footprint; state numbers must stay below 2000.
std::string T[2000][2000];             // NFA as an adjacency matrix of edge labels
std::string recT[2000][2000];          // DFA as an adjacency matrix of edge labels
int stateCountDFA;                     // number of DFA states
int reduceState[2][2000];              // work table used while simplifying the DFA
std::set<char> tv;                     // all terminal characters
std::set<char>::iterator Vt;           // cursor over tv used by reduceDFA()
int regCount;                          // number of regular expressions read
 55 
 56 int main()
 57 {
 58     regCount = 0;
 59     int i,j,k;
 60     int temp = 0;
 61     
 62     string * regArr;
 63     string * typeName;
 64     int * typeIndex;
 65     int **resultEnterExt;
 66 
 67 
 68     ifstream inRegular;
 69     inRegular.open("RegularExpressions.txt");
 70     inRegular>>regCount;                          //正则表达式个数
 71     typeName = new string[regCount];
 72     typeIndex = new int[regCount];
 73     resultEnterExt = new int*[regCount];      //定义一个二位动态数组用于存放每个正则表达式所生成的NFA的起始状态和终结状态 
 74     for( i = 0;i < regCount;i++ )
 75         resultEnterExt[i] = new int[2];
 76     regArr = new string[regCount];
 77     for( i = 0;i < regCount;i++ )
 78     {
 79         inRegular>>typeName[i];
 80         inRegular>>regArr[i];
 81     }                                           //读入正则表达式
 82     
 83     int stateCount = -1
 84     for( i = 0;i < regCount;i++ )                 //遍历每个正则表达式
 85     {
 86         for( j = 0;j < regArr[i].length()-1;j++ )
 87            if(   (  judgeLD(regArr[i].at(j) ) && (  judgeLD(regArr[i].at(j+1)) || regArr[i].at(j+1== '(' )  ) ||
 88                  (  regArr[i].at(j) == ')'  &&   ( regArr[i].at(j+1== '(' || judgeLD(regArr[i].at(j+1)) )   ) ||
 89                  (  regArr[i].at(j) == '*'  &&    ( regArr[i].at(j+1== '(' || judgeLD(regArr[i].at(j+1)) )  )   )
 90               regArr[i].insert(j+1,"&");           //预处理正则表达式,添加自定义运算符'&' 
 91 
 92                  
 93         regArr[i].insert(regArr[i].length(),"#%");  //在末尾添加结束符 
 94         cout<<regArr[i]<<endl;                      //输出处理后的正则表达式 
 95         
 96         
 97         j = 0;
 98         struct path *= NULL;
 99         struct dealStruct * result = dealWithStr( regArr[i],stateCount );  //对于每个正则表达式调用NFA生成函数 
100 
101         stateCount = result->stCount;
102         resultEnterExt[i][0= result->resPtr->from;    //记录该正则表达式的起始状态和终结状态 
103         resultEnterExt[i][1= result->resPtr->to;
104         typeIndex[i] = resultEnterExt[i][1];
105 /*        cout<<"The number of states now is "<<stateCount<<endl;
106         cout<<"The regular perform dealt with just now is "<<result->resPtr->strPath<<endl;
107         cout<<"The from dealt with just now is "<<resultEnterExt[i][0]<<endl;
108         cout<<"The to dealt with just now is "<<resultEnterExt[i][1]<<endl;*/
109     }
110     
111     stateCount++;                                  //汇总所有正则表达式的NFA 
112     cout<<"NFA合并完毕!"<<endl;
113                                   
114     for( i = 0;i < regCount;i++ )
115          T[stateCount][resultEnterExt[i][0]] = '`';      //从公共起始状态出发 
116 
117   /*  for( i = 0;i <= stateCount;i++ )       //输出产生的NFA以检验 
118     {
119         for( j = 0;j <= stateCount;j++ )
120         {
121              if( T[i][j] == "" ) cout<<" |";
122              else cout<<T[i][j]<<'|';
123         }
124         cout<<endl;
125     }*/
126 
127     stateCountDFA = reduceNFA( stateCount, typeIndex, regCount ); //调用NFA->DFA函数生成DFA 
128     reduceDFA();
129     
130     produceC( stateCountDFA, typeName );                           //产生.cpp代码
131 
132     system("pause");
133     
134     return 0;
135 }
136 
137 /*以下为NFA的生成函数,采用三个栈实现Thompson算法,第一个栈用于读入字符和运算符
138  第二个栈用于逆序,第三个栈专门用于处理一对括号内部的表达式,其实用两个栈就可以
139  实现,及把第三个栈舍弃,直接把括号的计算放在第一个栈中进行也可。为了计算更加
140  清楚不易出错故而多加了一个栈*
141  算法思路:读入正则表达式放入栈A中,*,&,|的优先级分别定为3,2,1,而表达式末
142  尾的'#'优先级最低,看做0。用一个三元数组存放当各个运算符的个数,字符均放入栈A中,
143  读到运算符时,当栈外的运算符的优先级大于等于站内所有运算符时,该运算符进栈,
144  如果低于所有站内运算符,计算站内所有运算符表的运算,直到栈内没有比栈外更高优先
145  级的运算符。
146  读到(的时候,(进栈,之后不论读到什么均进栈直到读到第一个),将这对()中的字符串 
147  提出,用栈B逆序后放入栈C,在栈C中按上述方式处理直到只有一个元素,将元素返回栈A
148  栈中存储的是指向path结构体的指针*/
149 struct dealStruct * dealWithStr( string str, int stateCountV )
150 {
151     int j = 0;int temp = 0;
152     struct path *= NULL;
153     int currentPri[3= {0};       //用于储存栈中三种运算符(*,&,|)的个数,便于计算
154     stack<struct path *> stackA;
155     while( str.at(j) != '%' )
156     { 
157             if(  judgeLD(str.at(j))  )   //如果是字符,放入栈A中
158             {
159                  tv.insert(str.at(j));   //放入tv中
160                  stateCountV++;                     
161                  temp = stateCountV;
162                  stateCountV++;
163                  T[temp][stateCountV] += str.at(j);
164                  p = new path();
165                  p->from = temp;
166                  p->to = stateCountV;
167                  p->strPath += str.at(j);
168                  stackA.push(p);
169                  //cout<<"push a new letter "<<p->strPath<<endl;
170                  j++;             
171             }
172             else if( str.at(j) == '*' )  //如果是'*'由于没有比它优先级更高的了, 
173             {                               //故必然是入栈 
174                 currentPri[0]++;
175                 p = new path();
176                 p->from = -1;
177                 p->to = -1;
178                 p->strPath += "*";
179                 stackA.push(p);
180                 //cout<<"push *"<<endl;
181                 j++;
182             }
183             else if( str.at(j) == '&' )  //如果是'&',检查之前有没有'*' 
184             {
185                 if( currentPri[0== 0 )  //没有的话,入栈 
186                     currentPri[1]++;
187                 else
188                 {
189                     currentPri[0]--;      //有,计算* 
190                     currentPri[1]++;
191                     
192                     stateCountV = jihe( stateCountV, stackA );      
193                 }
194                 p = new path();
195                 p->from = -2;
196                 p->to = -2;
197                 p->strPath = "&";
198                 stackA.push(p);          //不论有没有'*','&'必然会入栈 
199                 //cout<<"push &"<<endl;  
200                 j++;
201             }
202             else if( str.at(j) == '|' )   //如果是'|',查看前面有没有*,& 
203             {
204                  while( currentPri[0> 0 || currentPri[1> 0 )//一直算直到没有 
205                  {
206                       if( currentPri[0> 0 )
207                       {
208                           currentPri[0]--;
209                           stateCountV = jihe( stateCountV, stackA );
210                       }
211                       else
212                       {
213                           currentPri[1]--;
214                           stateCountV = lianjie( stateCountV, stackA );
215                       }
216                  }
217                  currentPri[2]++;       //不论有没有*,&,最后'|'必然还是要入栈 
218                  p = new path();
219                  p->from = -3;
220                  p->to = -3;
221                  p->strPath = "|";
222                  stackA.push(p);
223                  //cout<<"push |"<<endl;
224                  j++;    
225             }
226             else if( str.at(j) == '#' )   //如果是'#',则计算前面所有的运算
227             {
228                 while( currentPri[0> 0 || currentPri[1> 0 || currentPri[2> 0 )
229                 {
230                       if( currentPri[0> 0 )
231                       {
232                           currentPri[0]--;
233                           stateCountV = jihe( stateCountV, stackA );
234                       }
235                       else if(  currentPri[1> 0  )
236                       {
237                           currentPri[1]--;
238                           stateCountV = lianjie( stateCountV, stackA );
239                       }
240                       else
241                       {
242                           currentPri[2]--;
243                           stateCountV = huo( stateCountV, stackA );
244                       }
245                       
246                 }                
247                 //cout<<"The End, there is one struct path in stackA: "<<stackA.size()<<endl;                
248                 j++;
249                 //cout<<"It is "<<(stackA.top())->strPath<<endl;                
250             }
251             else if( str.at(j) == '(' )           //发现左括号,进入括号处理过程
252             {                 
253                  //cout<<"发现一个最外层括号"<<endl;           
254                  int parenthCount = 1;
255                  bool flag = true;
256                  while( parenthCount > 0 )
257                  {
258                      if( str.at(j) == '(' )
259                      {                         
260                          p = new path();
261                          p->from = -5;
262                          p->to = -5;
263                          p->strPath += "(";
264                          stackA.push(p);
265                          //cout<<"放入一左个括号"<<endl;
266                          if!flag )parenthCount++;
267                          flag = false;
268                      }   
269                      else if( str.at(j) == ')' )              //发现第一个右括号 
270                      {
271                          //cout<<"发现一个内层右括号"<<endl;
272                          parenthCount--;
273                          
274                          stack<struct path *> stackB;            //定义另两个栈 
275                          stack<struct path *> stackC;
276                          
277                          p = new path();
278                          p->from = -6;
279                          p->to = -6;
280                          p->strPath += "#";
281                          stackB.push(p);
282                          while( (stackA.top())->from != -5 )  //栈Apop到第一个( 
283                          {
284                              stackB.push( stackA.top() );  //将栈Apop出的放入B 
285                             // cout<<"stackA弹出一个"<<stackA.top()->strPath<<endl;
286                              stackA.pop();
287                              
288                          }
289                          //cout<<"stackA想弹出一个左小括号"<<stackA.top()->strPath<<endl; 
290                          stackA.pop();                         //去掉左小括号
291                                                 
292                          int pri[3= {0};
293                          while!stackB.empty() )       //开始处理栈B内的表达式 
294                          {
295                               p = stackB.top();
296                               stackB.pop(); 
297                               //cout<<"stackB弹出一个"<<p->from<<endl;
298                               if( p->from >= 0 )
299                                   stackC.push(p);      //用栈C处理栈B的表达式 
300                               else if( p->from == -1 )
301                               {
302                                    pri[0]++;
303                                    stackC.push(p);
304                               }
305                               else if( p->from == -2 )
306                               {
307                                    if( pri[0> 0 )
308                                    {
309                                         pri[0]--;                                       
310                                         stateCountV = jihe( stateCountV, stackC );
311                                    }
312                                    
313                                    stackC.push(p);
314                                    pri[1]++;
315                               }
316                               else if( p->from == -3 )
317                               {
318                                    while( pri[0> 0 || pri[1> 0 )
319                                    {
320                                        if( pri[0> 0 )
321                                        {
322                                            pri[0]--;                                       
323                                            stateCountV = jihe( stateCountV, stackC );
324                                        }
325                                        else
326                                        {
327                                            pri[1]--
328                                            stateCountV = lianjie( stateCountV, stackC );
329                                        }
330                                    
331                                    }
332                                    pri[2]++;
333                                    stackC.push(p);
334                               }
335                               else if( p->from == -6 )
336                               {
337                                    while( pri[0> 0 || pri[1> 0 || pri[2> 0 )
338                                    {
339                                        if( pri[0> 0 )
340                                        {
341                                            pri[0]--;
342                                            stateCountV = jihe( stateCountV, stackC );
343                                        }
344                                        else if( pri[1> 0 )  
345                                        {
346                                            pri[1]--;
347                                            stateCountV = lianjie( stateCountV, stackC );
348                                        }
349                                        else
350                                        {
351                                            pri[2]--;
352                                            stateCountV = huo( stateCountV, stackC );
353                                        }
354                                    }
355                               }
356                               else
357                                   cout<<"括号内含非法字符!"<<endl; 
358                          }
359                          //cout<<"stackC 中此时应该有1个东东"<<stackC.size()<<"它是"<<stackC.top()->strPath<<endl;
360                          
361                          //cout<<"此时的parenthCount为"<<parenthCount<<endl;           
362                          stackA.push(stackC.top());    //处理完成,栈C弹出结果 
363                          stackC.pop();                   //放入栈A 
364                      }
365                      else if( str.at(j) == '*' )
366                      {
367                          p = new path();
368                          p->from = -1;
369                          p->to = -1;
370                          p->strPath += "*";
371                          stackA.push(p);
372                      }
373                      else if( str.at(j) == '&' )
374                      {
375                          p = new path();
376                          p->from = -2;
377                          p->to = -2;
378                          p->strPath += "&";
379                          stackA.push(p);
380                      }
381                      else if( str.at(j) == '|' )
382                      {
383                          p = new path();
384                          p->from = -3;
385                          p->to = -3;
386                          p->strPath += "|";
387                          stackA.push(p);
388                      }
389                      else
390                      {
391                          stateCountV++;                        
392                          temp = stateCountV;
393                          stateCountV++;
394                          T[temp][stateCountV] += str.at(j);
395                          p = new path();
396                          p->from = temp;
397                          p->to = stateCountV;
398                          p->strPath += str.at(j);
399                          stackA.push(p);
400                      }
401                      tv.insert(str.at(j));   //放入tv中
402                      j++;
403                  }
404             }
405             else
406                 cout<<"含有非法字符!"<<endl;
407     }
408     
409     struct dealStruct * res = new dealStruct();
410     res->stCount = stateCountV;
411     res->resPtr = stackA.top();
412     stackA.pop();
413     cout<<"NFA生成完毕!"<<endl;
414     return res;
415 }
416 
417 
418 int jihe( int stateV, stack<struct path*> &stackA )
419 {
420     struct path *= stackA.top();stackA.pop();
421     struct path *= stackA.top();stackA.pop();
422     //cout<<"pop the value for * "<<a->strPath<<endl;
423     //cout<<"pop the sign for * "<<b->strPath<<endl;
424     
425     stateV++;
426     int temp = stateV;
427     stateV++;
428     T[temp][a->from] += "`";
429     T[temp][stateV]  += "`";
430     T[a->to][stateV] += "`";
431     T[a->to][a->from] += "`";
432     
433     struct path *= new path();
434     p->from = temp;
435     p->to = stateV;
436     p->strPath = "(" + a->strPath + ")*";
437     stackA.push(p);
438     //cout<<"push a new one of \""<<p->strPath<<"\""<<endl;
439     
440     delete a;delete b;
441     return stateV;
442 }
443 int lianjie( int stateV, stack<struct path*> &stackV  )
444 {
445     struct path *= stackV.top();stackV.pop();
446     //cout<<"pop the second value for & "<<a->strPath<<endl;
447     struct path *= stackV.top();stackV.pop();
448     //cout<<"pop the sign for & "<<b->strPath<<endl;
449     struct path *= stackV.top();stackV.pop();    
450     //cout<<"pop the first value for & "<<c->strPath<<endl;
451     T[c->to][a->from] += "`";
452     
453     struct path *= new path();
454     p->from = c->from;
455     p->to = a->to;
456     p->strPath = "(" + c->strPath + ")&(" + a->strPath + ")";
457     stackV.push(p);
458     //cout<<"push a new one of \""<<p->strPath<<"\""<<endl;
459     delete a;delete b;delete c;
460     
461     return stateV;
462 }
463 int huo( int stateV, stack<struct path*> &stackA  )
464 {
465     struct path *= stackA.top();stackA.pop();
466     struct path *= stackA.top();stackA.pop();
467     struct path *= stackA.top();stackA.pop();
468     //cout<<"pop the first value for | "<<c->strPath<<endl;
469     //cout<<"pop the second value for | "<<a->strPath<<endl;
470     //cout<<"pop the sign for | "<<b->strPath<<endl;
471     
472     stateV++;
473     int temp = stateV;
474     stateV++;
475     T[temp][a->from] += "`";
476     T[temp][c->from] += "`";
477     T[a->to][stateV] += "`";
478     T[c->to][stateV] += "`";
479     
480     struct path *= new path();
481     p->from = temp;
482     p->to = stateV;
483     p->strPath = "(" + c->strPath + ")|(" + a->strPath + ")";
484     stackA.push(p);
485     //cout<<"push a new one of \""<<p->strPath<<"\""<<endl;
486     delete a;delete b;delete c;
487     
488     return stateV;
489 }
490 
491 int reduceNFA( int stateCountV, int *typeIndexV, int typeNumV )    //stateCountV为NFA的起始状态
492 {
493     int i,j,k;
494     struct setNode *head,*setPtr,*end; //用于构造链表的指针,新状态用链表存储 
495     struct setNode *currentPtr;        //用于遍历链表中元素的指针 
496                                                   
497     set<int>::iterator ite_int;        //每个新的状态为一个set,封装到setNode中 
498     int countNode = 1;
499     setPtr = new setNode(); 
500     setPtr->num = 1;
501     setPtr->Dset = new set<int>();
502     setPtr->next = NULL;  
503     BFS(stateCountV, stateCountV);
504     for( i = 0;arrForBFS[i] >= 0;i++  )//先把起始状态的ε闭包求出作为新起始状态 
505         setPtr->Dset->insert(arrForBFS[i]);
506              
507     head = setPtr;
508     end = setPtr;
509     currentPtr = head;
510     
511     string letter1[80];                //这里定义了两个数组,一个存放读入的字符 
512     set<int> letter2[80];              //一个存放读入相应字符后所到达的状态集合 
513     int up = 0;
514     
515     
516     while( currentPtr != NULL )            //遍历新状态链表
517     {
518          //cout<<"currentPtr遍历到"<<currentPtr->num<<endl; 
519          //下面一个双重循环,第一个循环表示遍历新状态集合的每一个状态 
520          //第二个循环用于从原NFA找从这个状态出发到达的所有状态 
521          for( ite_int = currentPtr->Dset->begin(); ite_int != currentPtr->Dset->end(); ++ite_int )
522              for( i = 0;i <= stateCountV;i++ )
523                  if( T[*ite_int][i] != "`" && T[*ite_int][i] != "" )
524                  {
525                      bool flag2 = 0;
526                      for( j = 0;j < up;j++ )
527                          if( letter1[j] == T[*ite_int][i] )
528                          {
529                              flag2 = 1;
530                              break;
531                          }
532                      if!flag2 )
533                      {
534                          up++;
535                          letter1[up-1= T[*ite_int][i];  //读入的字符都记录下 
536                      }
537                      BFS( i, stateCountV );
538                      for( k = 0;arrForBFS[k] >= 0;k++ )
539                              letter2[j].insert(arrForBFS[k]);//所到的状态也记下 
540                  }
541          
542          /*cout<<"从新的状态"<<currentPtr->num<<"生成的矩阵是"<<endl;
543          for( i = 0;i < up;i++ )
544          {
545              cout<<letter1[i]<<"  ";
546              for( ite_int = letter2[i].begin(); ite_int != letter2[i].end(); ++ite_int )
547                  cout<<*ite_int<<" ";
548              cout<<endl;
549          }//最后letter1,letter2记录了这个新状态读入不同的字符后会到达的状态合集*/ 
550 
551      
552          for( i = 0;i < up;i++ ) //下面开始比较这些状态是不是以前有过的 
553          {
554               struct setNode *= head;
555               while( p != NULL )
556               {
557                   if( letter2[i] == *(p->Dset) )
558                       break;
559                   p = p->next;
560               }
561               if( p != NULL )   //有这个状态,只要加一条路径即可 
562                   recT[currentPtr->num][p->num] += letter1[i];
563               else       //没有这个状态,要在链表末加一个节点,并在recT中加路径 
564               {
565                   countNode++;
566                   recT[currentPtr->num][countNode] += letter1[i];
567                   setPtr = new setNode();
568                   setPtr->num = countNode;
569                   setPtr->Dset = new set<int>();
570                   for( ite_int = letter2[i].begin(); ite_int != letter2[i].end(); ++ite_int )
571                       setPtr->Dset->insert( *ite_int );
572                   setPtr->next = NULL;
573                   end->next = setPtr;
574                   end = setPtr;
575                   
576                   for( k = 0;k < typeNumV;k++ )
577                   {
578                       ite_int = setPtr->Dset->find(typeIndexV[k]);   
579                       if( ite_int != setPtr->Dset->end() || *(setPtr->Dset->end()) == stateCountV )
580                       {
581                           char temp[10];
582                           string str = itoa(k,temp,10);
583                           recT[setPtr->num][0= "-"+str; //如果是终止状态,做记号
584                           recT[0][setPtr->num] = "-"+str;
585                       }
586                   }
587               }
588          }
589          for( i = 0;i < up;i++ )          //清空letter1,letter2,用于下一轮遍历
590          {
591               letter1[i] = "";
592               letter2[i].clear();
593          }
594          up = 0;
595          currentPtr = currentPtr->next;
596          
597          /*cout<<"此时的链表为"<<endl;
598          struct setNode *p2 = head;
599          while( p2 != NULL )
600               {
601                   cout<<p2->num<<" ";
602                   p2 = p2->next;
603               }
604          cout<<endl;*/
605     }
606     
607     cout<<"DFA生成完毕!"<<endl;
608     /*cout<<"最后DFA为:"<<endl; 
609     for( i = 0;i <= countNode;i++,cout<<endl )
610         for( j = 0; j <= countNode;j++ )
611         { 
612            if( recT[i][j] == "" )cout<<"  |";  
613            else 
614                cout<<setw(2)<<recT[i][j]<<"|";
615         }*/
616     return countNode;
617     
618 }
619 
620 void BFS( int row, int stateCountV )  //广度搜索法求一个状态的ε闭包 
621 {
622     forint i = 0;i < 100;i++ )
623         arrForBFS[i] = -1;
624     int ite = -1;
625     queue<int> que;
626     que.push(row);
627     ite++;
628     arrForBFS[ite] = row;
629     while!que.empty() )
630     {
631         int t = que.front();
632         que.pop();
633         forint i = 0;i <= stateCountV;i++ )
634             if( T[t][i] == "`" )
635             {                
636                 ite++;
637                 arrForBFS[ite] = i;
638                 que.push(i);
639             }
640     }    
641 }
642 
643 void reduceDFA()   //化简DFA算法
644 {
645     /*
646     算法思想:
647     构造二维数组reduceState[2][stateCountDFA]
648     矩阵第一行存储各状态当前所属集合标记
649     矩阵第二行存储个状态读入某个字符后的目标状态所属集合标记
650     每读入某个字符后,矩阵两行相对应位置都相等的状态为同一集合
651     第一行相等而第二行不相等的两状态分到不同集合,即将后一个状态的所属集合标记+1
652     全部字符读完后,扫描矩阵的第一行,即当前所属集合标记,相等的状态合并
653     简化完成
654     */
655     int pos = 1;   //非终结状态所属集合标记
656     int poe;       //终结状态所属集合标记
657     for (int i=1;i<=stateCountDFA;i++//该循环用于给所有状态的所属集合赋初值,终结状态为stateCountDFA,非终结状态为1.
658     {
659         if(recT[0][i].find('-')!=string::npos)
660         {
661             poe=stateCountDFA+atoi(&recT[0][i].at(1));//将不同正则表达式的终结状态标注为不同集合:stateCountDFA+正则表达式序号
662             reduceState[0][i]=poe;
663             reduceState[1][i]=poe;
664         }
665         else
666         {
667             reduceState[0][i]=pos;
668             reduceState[1][i]=pos;
669         }
670     }
671     poe=stateCountDFA+regCount+1;
672     bool flag;  //bool标记,读每个字符,调整集合状态后,重新读入该字符,直到不发生集合改变为止
673     for(Vt=tv.begin();Vt!=tv.end();Vt++)  //读入每个字符的循环
674     {
675         //cout<<*Vt;
676         flag=true;
677         while(flag)  //该循环即为重新读入当前字符的循环
678         {
679             flag = false;
680             bool item = true;  //用于标记某状态读入当前字符是否产生状态变化
681             for(int j=1;j<=stateCountDFA;j++
682             {
683                 item=false;
684                 for(int k=1;k<=stateCountDFA;k++)//循环遍历DFA
685                 {
686                     if(recT[j][k].find(*Vt)!=string::npos)//若当前状态读入该字符产生状态变化
687                     {
688                         item=true;
689                         reduceState[1][j]=reduceState[0][k];//目标集合赋值为目标状态所属集合
690                     }
691                     if(!item)  //不产生状态变化
692                         reduceState[1][j]=0;  //目标集合标记为0
693                 }
694                 
695             }
696             bool b1=false;//标记用于判断pos是否加1
697             bool b2=false;//标记用于判断poe是否加1
698             for(int p=1;p<=stateCountDFA;p++)//循环扫描reduceState矩阵.第一个标记位置
699             {
700                 b1=false;b2=false;
701                 for(int q=2;q<=stateCountDFA;q++)//循环扫描reduceState矩阵.第二个标记位置
702                 {                    
703                     if(reduceState[0][p]==reduceState[0][q]&&reduceState[1][p]!=reduceState[1][q])
704                     { 
705                         //如果存在所属集合相等,目标集合不等的情况,将所属集合分开
706                         if(reduceState[0][q]<stateCountDFA)  //当前状态为非终结状态
707                         {
708                             b1=true;
709                             reduceState[0][q]=pos;
710                         }
711                         else  //当前状态为终结状态
712                         {
713                             b2=true;
714                             reduceState[0][q]=poe;
715                         }
716                         flag=true;  //发生集合调整,标记为true
717                     }                    
718                 }
719                 if(b1)
720                     pos++;
721                 if(b2)
722                     poe++;
723             }
724         }
725     }
726 
727 //    for(int i1=0;i1<stateCountDFA;i1++)
728 //        cout<<reduceState[0][i1]<<" ";
729 //    cout<<endl;
730 //    for(i1=0;i1<stateCountDFA;i1++)
731 //        cout<<reduceState[1][i1]<<" ";
732 
733     for(int j=1;j<stateCountDFA;j++)  //循环扫描reduceState矩阵第一行.第一个标记位置
734     {
735         if(reduceState[0][j]!=0)
736         {
737             for(int k=j+1;k<=stateCountDFA;k++)  //循环扫描reduceState矩阵第一行.第二个标记位置
738             {
739                 if(reduceState[0][j]==reduceState[0][k])//两个标记位置值相等,即所属集合相同
740                     combine(j,k);  //调用合并函数
741             }
742         }
743     }
744     /*cout<<"化简后的DFA为:"<<endl; 
745     for(int m = 0;m <= stateCountDFA;m++,cout<<endl )
746         for(int n = 0; n <= stateCountDFA;n++ )
747         { 
748            if( recT[m][n] == "" )cout<<" |";  
749            else 
750                cout<<recT[m][n]<<"|";
751         }*/
752         
753     cout<<"DFA简化完毕!"<<endl;
754 }
755 
756 void combine(int j,int k)  
757 {
758     /*
759     函数用于合并两个状态
760     把第K状态的相应的行和列的值加到第j状态上
761     并把k状态所有的值置空
762     */
763     for(int i=1;i<=stateCountDFA;i++)
764     {
765         recT[j][i]+=recT[k][i];
766         recT[i][j]+=recT[i][k];
767     }
768     for(i=0;i<=stateCountDFA;i++)
769     {
770         recT[k][i]="";
771         recT[i][k]="";
772     }
773     reduceState[0][k]=0;
774 }
775 
776 void produceC( int nodeCountV, string *typeNameV )       //产生.cpp文件
777 {
778     int i, j ,k;
779     char buf[1024];                //临时保存读取出来的文件内容
780     string message = "";
781     string remainCode = "";
782     ifstream in;
783     in.open("Remain.txt");
784     if(in.is_open())          //文件打开成功,说明曾经写入过东西
785     {
786         while(in.good() && !in.eof())
787         {
788             memset(buf,0,1024);
789             in.getline(buf,1204);
790             message = buf;
791             message += "\n";
792             remainCode += message;
793         }
794         in.close();
795     }
796     
797     ofstream fout("scanner.cpp",ios::out);
798     fout<<remainCode;
799     fout<<"while(flag)\n"<<"{\n";
800     fout<<"switch(state)\n"<<"{\n";
801     for( i = 1;i <= nodeCountV;i++ )   //遍历每个新状态 
802     {
803          fout<<"case "<<i<<":\n";
804          fout<<"last++;\n";
805          fout<<"c = code[last];\n";
806          if( i == 1 )
807          {
808              fout<<"if( c == '%' )\n";  //读到文件末尾,终结 
809              fout<<"{\n";
810              fout<<"flag = 0;\n";
811              fout<<"break;\n}\n";
812          }
813          bool flag3 = 0;bool flag2 = 0;
814          for( j = 1;j <= nodeCountV;j++ )
815               if( recT[i][j] != "'" && recT[i][j] != "" )
816               {
817                   flag3 = 1;                  
818                   if( flag2 )
819                       fout<<"else ";
820                       
821                   fout<<"if( contain( \""<<recT[i][j]<<"\",c ) ) ";
822                   fout<<"state = "<<j<<";\n";
823                   flag2 = 1;
824               }
825          if( flag3 )
826              fout<<"else\n{\n";
827          //fout<<"{\n"<<"endState.find("<<i<<");\n";
828          //fout<<"if( ite_int == endState.find && *(endState.end()) != "<<i<<")\n";
829          if( recT[i][0][0== '-' )
830          {
831              string tempIndexStr = recT[i][0].substr(1,recT[i][0].length()-1);
832              int tempIndex = atoi(tempIndexStr.c_str());
833              string type = typeNameV[tempIndex];
834              fout<<"flag2 = true;\n";
835              fout<<"for( k = 0;k < count;k++ )\n";;
836              fout<<"{\n"<<"if( SymbolTable[k][1] == code.substr(first,last-first) )\n";
837              fout<<"{\nflag2 = false;\nbreak;\n}\n}\n";
838              fout<<"if(flag2)\n{\n";
839              fout<<"count++;\n";
840              fout<<"SymbolTable[count-1][0] = \""<<type<<"\";\n";
841              fout<<"SymbolTable[count-1][1] = code.substr(first,last-first);\n";
842              fout<<"}\n";
843 
844              fout<<"cout<<code.substr(first,last-first)<<endl;\n";
845              fout<<"last--;\n";           
846              fout<<"first = last+1;\n";
847              fout<<"state = 1;\n";
848          }
849          else
850          {
851              fout<<"cout<<\"编译错误!\\n\";\n";
852              fout<<"flag = 0;\n"
853          }
854          if( flag3 )
855              fout<<"}\n";
856          fout<<"break;\n";
857     }
858     fout<<"default:\n"<<"break;\n"<<"}\n}\n";
859     
860     fout<<"cout<<\"Symbol 表如下:\"<<endl;\n";
861     fout<<"cout<<setw(3)<<\"序号\"<<setw(10)<<\"token类型\"<<\"     \"<<\"token名字\"<<endl;\n";
862     fout<<"for( k2 = 0;k2 < count;k2++ )\n";
863     fout<<"{\n"<<"cout<<setw(3)<<k2<<setw(10)<<SymbolTable[k2][0]<<\"     \"<<SymbolTable[k2][1]<<endl;\n";
864     fout<<"}\n";
865     fout<<"system(\"pause\");\n";
866     fout<<"return 0;\n}\n";
867 
868     cout<<"代码生成完毕!见scanner.cpp文件。"<<endl;
869 }

正则表达式在RegularExpressions.txt中写明,示例如下:

4
Identity
a(bc)*|bc*|d
Number
(1|2|3|4|5|6|7|8|9|0)*
key
if|else|then
operator
>(`|=)|<(`|=)|=(=|`)

第一行为正则表达式的个数,之后输入表达式代表的类型名,回车,表达式,回车,以此类推。

 remain.txt存放着生成的代码文件中固定的代码,生成代码时会先从此文件读取。

 里面的代码如下:

 

remain.txt
#include<iostream>
using namespace std;
#include<string>
#include<fstream>
#include<set>
#include<iomanip>
#include<cstdlib>   // system(), used at the end of the generated main()
#include<cstring>   // memset(), used by the original file-reading loop
// Symbol table filled in by the generated scanner:
// column 0 receives the token type name, column 1 the token text.
string SymbolTable[1000][2] = {""};
// Reports whether character v occurs anywhere in str.
// Returns true on the first match, false otherwise (including for an empty str).
bool contain( const std::string& str, char v )
{
    // std::string::find replaces the hand-written index loop and avoids
    // comparing a signed index against the unsigned length().
    return str.find(v) != std::string::npos;
}
int main()
{  
    
string tempCode = "";
    
string code = "";
    
    
int i,j,k,k2=0;
    
char buf[1024];                //临时保存读取出来的文件内容
    string message = "";
    ifstream infile;
    infile.open(
"code.txt");
    
if(infile.is_open())          //文件打开成功,说明曾经写入过东西
    {
        
while(infile.good() && !infile.eof())
        {
            memset(buf,
0,1024);
            infile.getline(buf,
1204);
            message 
= buf;
            tempCode 
+= message;
        }
        infile.close();
    }
    
for( i = 0;i < tempCode.length();i++ )
    {
         
if( tempCode[i] != ' ' && tempCode[i] != '\t' && tempCode[i] != '\n' && tempCode[i] != ';' )
         code 
+= tempCode[i];
    }
    code 
+= "%";
    cout
<<code<<endl;
    
    
int first = 0, last = -1;
    
char c;
    
string tempStr = "";
    
int state = 1;
    
bool flag = 1;
    
bool flag2 = true;
    
int count = 0;
    
//set<int> endState;
    
//set<int>::iterator ite_int;
///////////////////////////////////////////保留代码///////////////////////////////////////////////////

表达式读入生成的DFA简化后生成.cpp文件,命名为scanner.cpp

根据上面的那个示例,生成的scanner.cpp文件内容为:

scanner.txt
#include<iostream>
using namespace std;
#include<string>
#include<fstream>
#include<set>
#include<iomanip>
#include<cstdlib>   // system(), called at the end of main()
#include<cstring>   // memset(), used by the original file-reading loop
// Symbol table filled in by the scanner:
// column 0 holds the token type name, column 1 the token text.
string SymbolTable[1000][2] = {""};
// Reports whether character v occurs anywhere in str.
// Returns true on the first match, false otherwise (including for an empty str).
bool contain( const std::string& str, char v )
{
    // std::string::find replaces the hand-written index loop and avoids
    // comparing a signed index against the unsigned length().
    return str.find(v) != std::string::npos;
}
int main()
{  
    
string tempCode = "";
    
string code = "";
    
    
int i,j,k,k2=0;
    
char buf[1024];                //临时保存读取出来的文件内容
    string message = "";
    ifstream infile;
    infile.open(
"code.txt");
    
if(infile.is_open())          //文件打开成功,说明曾经写入过东西
    {
        
while(infile.good() && !infile.eof())
        {
            memset(buf,
0,1024);
            infile.getline(buf,
1204);
            message 
= buf;
            tempCode 
+= message;
        }
        infile.close();
    }
    
for( i = 0;i < tempCode.length();i++ )
    {
         
if( tempCode[i] != ' ' && tempCode[i] != '\t' && tempCode[i] != '\n' && tempCode[i] != ';' )
         code 
+= tempCode[i];
    }
    code 
+= "%";
    cout
<<code<<endl;
    
    
int first = 0, last = -1;
    
char c;
    
string tempStr = "";
    
int state = 1;
    
bool flag = 1;
    
bool flag2 = true;
    
int count = 0;
    
//set<int> endState;
    
//set<int>::iterator ite_int;
///////////////////////////////////////////保留代码///////////////////////////////////////////////////
while(flag)
{
switch(state)
{
case 1:
last
++;
= code[last];
if( c == '%' )
{
flag 
= 0;
break;
}
if( contain( "a",c ) ) state = 2;
else if( contain( "b",c ) ) state = 3;
else if( contain( "d",c ) ) state = 4;
else if( contain( "1234567890",c ) ) state = 5;
else if( contain( "i",c ) ) state = 15;
else if( contain( "e",c ) ) state = 16;
else if( contain( "t",c ) ) state = 17;
else if( contain( "><=",c ) ) state = 18;
else
{
cout
<<"编译错误!\n";
flag 
= 0;
}
break;
case 2:
last
++;
= code[last];
if( contain( "bb",c ) ) state = 21;
else
{
flag2 
= true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1== code.substr(first,last-first) )
{
flag2 
= false;
break;
}
}
if(flag2)
{
count
++;
SymbolTable[count
-1][0= "Identity";
SymbolTable[count
-1][1= code.substr(first,last-first);
}
cout
<<code.substr(first,last-first)<<endl;
last
--;
first 
= last+1;
state 
= 1;
}
break;
case 3:
last
++;
= code[last];
if( contain( "c",c ) ) state = 3;
else
{
flag2 
= true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1== code.substr(first,last-first) )
{
flag2 
= false;
break;
}
}
if(flag2)
{
count
++;
SymbolTable[count
-1][0= "Identity";
SymbolTable[count
-1][1= code.substr(first,last-first);
}
cout
<<code.substr(first,last-first)<<endl;
last
--;
first 
= last+1;
state 
= 1;
}
break;
case 4:
last
++;
= code[last];
flag2 
= true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1== code.substr(first,last-first) )
{
flag2 
= false;
break;
}
}
if(flag2)
{
count
++;
SymbolTable[count
-1][0= "Identity";
SymbolTable[count
-1][1= code.substr(first,last-first);
}
cout
<<code.substr(first,last-first)<<endl;
last
--;
first 
= last+1;
state 
= 1;
break;
case 5:
last
++;
= code[last];
if( contain( "1121233123444123455551234566666123456777777123456788888881234567899999999123456789000000000",c ) ) state = 5;
else
{
flag2 
= true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1== code.substr(first,last-first) )
{
flag2 
= false;
break;
}
}
if(flag2)
{
count
++;
SymbolTable[count
-1][0= "Number";
SymbolTable[count
-1][1= code.substr(first,last-first);
}
cout
<<code.substr(first,last-first)<<endl;
last
--;
first 
= last+1;
state 
= 1;
}
break;
case 6:
last
++;
= code[last];
cout
<<"编译错误!\n";
flag 
= 0;
break;
case 7:
last
++;
= code[last];
cout
<<"编译错误!\n";
flag 
= 0;
break;
case 8:
last
++;
= code[last];
cout
<<"编译错误!\n";
flag 
= 0;
break;
case 9:
last
++;
= code[last];
cout
<<"编译错误!\n";
flag 
= 0;
break;
case 10:
last
++;
= code[last];
cout
<<"编译错误!\n";
flag 
= 0;
break;
case 11:
last
++;
= code[last];
cout
<<"编译错误!\n";
flag 
= 0;
break;
case 12:
last
++;
= code[last];
cout
<<"编译错误!\n";
flag 
= 0;
break;
case 13:
last
++;
= code[last];
cout
<<"编译错误!\n";
flag 
= 0;
break;
case 14:
last
++;
= code[last];
cout
<<"编译错误!\n";
flag 
= 0;
break;
case 15:
last
++;
= code[last];
if( contain( "f",c ) ) state = 23;
else
{
cout
<<"编译错误!\n";
flag 
= 0;
}
break;
case 16:
last
++;
= code[last];
if( contain( "l",c ) ) state = 24;
else
{
cout
<<"编译错误!\n";
flag 
= 0;
}
break;
case 17:
last
++;
= code[last];
if( contain( "h",c ) ) state = 25;
else
{
cout
<<"编译错误!\n";
flag 
= 0;
}
break;
case 18:
last
++;
= code[last];
if( contain( "===",c ) ) state = 26;
else
{
flag2 
= true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1== code.substr(first,last-first) )
{
flag2 
= false;
break;
}
}
if(flag2)
{
count
++;
SymbolTable[count
-1][0= "operator";
SymbolTable[count
-1][1= code.substr(first,last-first);
}
cout
<<code.substr(first,last-first)<<endl;
last
--;
first 
= last+1;
state 
= 1;
}
break;
case 19:
last
++;
= code[last];
cout
<<"编译错误!\n";
flag 
= 0;
break;
case 20:
last
++;
= code[last];
cout
<<"编译错误!\n";
flag 
= 0;
break;
case 21:
last
++;
= code[last];
if( contain( "c",c ) ) state = 2;
else
{
cout
<<"编译错误!\n";
flag 
= 0;
}
break;
case 22:
last
++;
= code[last];
cout
<<"编译错误!\n";
flag 
= 0;
break;
case 23:
last
++;
= code[last];
flag2 
= true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1== code.substr(first,last-first) )
{
flag2 
= false;
break;
}
}
if(flag2)
{
count
++;
SymbolTable[count
-1][0= "key";
SymbolTable[count
-1][1= code.substr(first,last-first);
}
cout
<<code.substr(first,last-first)<<endl;
last
--;
first 
= last+1;
state 
= 1;
break;
case 24:
last
++;
= code[last];
if( contain( "s",c ) ) state = 30;
else
{
cout
<<"编译错误!\n";
flag 
= 0;
}
break;
case 25:
last
++;
= code[last];
if( contain( "e",c ) ) state = 31;
else
{
cout
<<"编译错误!\n";
flag 
= 0;
}
break;
case 26:
last
++;
= code[last];
flag2 
= true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1== code.substr(first,last-first) )
{
flag2 
= false;
break;
}
}
if(flag2)
{
count
++;
SymbolTable[count
-1][0= "operator";
SymbolTable[count
-1][1= code.substr(first,last-first);
}
cout
<<code.substr(first,last-first)<<endl;
last
--;
first 
= last+1;
state 
= 1;
break;
case 27:
last
++;
= code[last];
cout
<<"编译错误!\n";
flag 
= 0;
break;
case 28:
last
++;
= code[last];
cout
<<"编译错误!\n";
flag 
= 0;
break;
case 29:
last
++;
= code[last];
cout
<<"编译错误!\n";
flag 
= 0;
break;
case 30:
last
++;
= code[last];
if( contain( "e",c ) ) state = 23;
else
{
cout
<<"编译错误!\n";
flag 
= 0;
}
break;
case 31:
last
++;
= code[last];
if( contain( "n",c ) ) state = 23;
else
{
cout
<<"编译错误!\n";
flag 
= 0;
}
break;
case 32:
last
++;
= code[last];
cout
<<"编译错误!\n";
flag 
= 0;
break;
case 33:
last
++;
= code[last];
cout
<<"编译错误!\n";
flag 
= 0;
break;
default:
break;
}
}
cout
<<"Symbol 表如下:"<<endl;
cout
<<setw(3)<<"序号"<<setw(10)<<"token类型"<<"     "<<"token名字"<<endl;
for( k2 = 0;k2 < count;k2++ )
{
cout
<<setw(3)<<k2<<setw(10)<<SymbolTable[k2][0]<<"     "<<SymbolTable[k2][1]<<endl;
}
system(
"pause");
return 0;
}

等待编译的代码写在code.txt中,这个txt相当于代码输入界面,这里的内容对应着上面的那些正则表达式示例,内容如下:

 

 

if a >= 3
bcc = 123;
else if d < 9
abc = 12

运行主体代码后,scanner.cpp里的代码编译后的程序会读入code.txt中的代码,再在运行界面上输出token串和Symbol Table,大功告成!

posted on 2010-01-15 19:51  Felix Fang  阅读(493)  评论(0编辑  收藏  举报

导航