今天实现了一个递归下降的RegexParser, 可以把正则表达式转化为语法树.
Russ Cox在他关于正则表达式解析的文章(here)里, 首先把正则式转化成了逆波兰表达式.
语法树的表达能力比逆波兰式要好一些, 而且树的后序遍历就是逆波兰式.
大概就是对于每一种运算符, 实施一个parse的子过程, 我之所以考虑使用递归实现, 是考虑到这样写以后扩展的话码代码会方便一些XDD.
下面给出Parser的代码:
1 //Symbol.h 2 #ifndef __SYMBOL_H__ 3 #define __SYMBOL_H__ 4 5 6 enum TokenType 7 { 8 EOF = -1, 9 START = 0,//^ 10 PLUS, //+ 11 STAR,//* 12 BRACKET, //() 13 SQUARE, //[] 14 QUESTION, // ? 15 END, //$ 16 CONCAT, //. 17 ALT, //| 18 DOT, // 19 ALPHA, // 20 NUM, 21 }; 22 23 enum DumpOrder 24 { 25 PRE, MID, POST 26 }; 27 28 29 #endif // __SYMBOL_H__ 30 31 32 //Parser.h 33 #ifndef __PARSER_H__ 34 #define __PARSER_H__ 35 36 #include <cstdlib> 37 #include <ctype.h> 38 #include <cstring> 39 #include <stack> 40 #include "Symbol.h" 41 42 using namespace std; 43 44 #define MAXN 1000000 45 46 typedef struct Node 47 { 48 TokenType token; 49 int subType; //<By张方雪 2013-5-18>用于处理一种token会对应多种的情形. 50 Node* left; 51 Node* right; 52 53 void Init(TokenType tok, int subT, Node* l, Node* r) 54 { 55 token = tok; 56 subType = subT; 57 left = l; 58 right = r; 59 } 60 }Node; 61 62 63 64 class Parser 65 { 66 public: 67 Parser(); 68 ~Parser(); 69 Node* Parse(char* regexStr); 70 void Dump(Node* root, DumpOrder order); 71 72 private: 73 Node* _buffer; 74 int _used; 75 int _pos; 76 char* _regex; 77 int _len; 78 79 Node* NewNode() 80 { 81 return &_buffer[_used++]; 82 } 83 char CurrentToken() 84 { 85 if(_pos >= _len) return -1; 86 return _regex[_pos]; 87 } 88 void NextToken() 89 { 90 _pos++; 91 } 92 93 Node* Parse();// recursive implementation 94 Node* ParseAlt(); 95 Node* ParseConcat(); 96 Node* ParsePlus(); 97 Node* ParseBracket(); 98 Node* ParseDot(); 99 Node* ParseAtom(); 100 Node* ParseMulti(); 101 102 void DumpMidOrder(Node* root); 103 void DumpPostOrder(Node* root); 104 105 }; 106 107 108 #endif // __PARSER_H__ 109 110 111 112 //Parser.cpp 113 #include <iostream> 114 #include "Parser.h" 115 116 117 using namespace std; 118 119 120 Parser::Parser() 121 { 122 _used = 0; 123 _buffer = new Node[MAXN]; 124 _pos = 0; 125 _regex = new char[MAXN]; 126 127 } 128 Parser::~Parser() 129 { 130 delete []_buffer; 131 } 132 133 134 Node* Parser::Parse(char* regexStr) 135 { 136 strcpy(_regex, regexStr); 137 
_len = strlen(regexStr); 138 _pos = 0; 139 _used = 0; 140 141 return Parse(); 142 } 143 144 Node* Parser::Parse() 145 { 146 return ParseAlt(); 147 } 148 149 Node* Parser::ParseAlt() 150 { 151 Node* root = ParseConcat(); 152 char token = CurrentToken(); 153 154 if(token == '|') 155 { 156 NextToken(); 157 Node* leftChild = root; 158 Node* rightChild = ParseAlt(); 159 root = NewNode(); 160 root->Init(ALT, 0, leftChild, rightChild); 161 } 162 163 return root; 164 165 } 166 167 Node* Parser::ParseConcat() 168 { 169 Node* root = ParseMulti(); 170 char token = CurrentToken(); 171 172 if(isalnum(token) || token == '(') 173 { 174 Node* leftChild = root; 175 Node* rightChild = ParseConcat(); 176 root = NewNode(); 177 178 root->Init(CONCAT, 0, leftChild, rightChild); 179 // Dump(root, MID); 180 } 181 182 return root; 183 } 184 185 Node* Parser::ParseMulti() 186 { 187 Node* root = ParseBracket(); 188 char token = CurrentToken(); 189 while(token == '+' || token == '*' || token == '?') 190 { 191 Node* leftChild = root; 192 root = NewNode(); 193 switch(token) 194 { 195 case '+': 196 root->Init(PLUS, 0, leftChild, NULL); 197 break; 198 case '?': 199 root->Init(QUESTION, 0, leftChild, NULL); 200 break; 201 case '*': 202 root->Init(STAR, 0, leftChild, NULL); 203 break; 204 } 205 NextToken(); 206 token = CurrentToken(); 207 } 208 return root; 209 } 210 211 Node* Parser::ParseBracket() 212 { 213 Node* root = NULL; 214 char token = CurrentToken(); 215 if(token == '(') 216 { 217 NextToken(); 218 root = ParseAlt(); 219 if(CurrentToken() != ')') 220 { 221 cout << "Error " << CurrentToken() << endl; 222 return NULL; 223 224 } 225 NextToken(); 226 } 227 else 228 { 229 root = ParseAtom(); 230 } 231 return root; 232 } 233 234 Node* Parser::ParseAtom() 235 { 236 Node* root = NULL; 237 char token = CurrentToken(); 238 239 if(isalpha(token)) 240 { 241 root = NewNode(); 242 root->Init(ALPHA, token, NULL, NULL); 243 NextToken(); 244 return root; 245 } 246 else if(isalnum(token)) 247 { 248 root = 
NewNode(); 249 root->Init(NUM, token, NULL, NULL); 250 NextToken(); 251 return root; 252 } 253 else if(token == '.') 254 { 255 root = NewNode(); 256 root->Init(DOT, 0, NULL, NULL); 257 NextToken(); 258 return root; 259 } 260 else 261 { 262 cout << "出现未能处理的token" << endl; 263 return NULL; 264 } 265 266 267 } 268 269 void Parser::DumpMidOrder(Node* root) 270 { 271 if(root == NULL) return; 272 switch(root->token) 273 { 274 case ALPHA: 275 case NUM: 276 { 277 cout << char(root->subType) ; 278 break; 279 } 280 case PLUS: 281 { 282 cout << "("; 283 DumpMidOrder(root->left); 284 cout << ")+"; 285 break; 286 } 287 case STAR: 288 { 289 cout << "("; 290 DumpMidOrder(root->left); 291 cout << ")*"; 292 break; 293 } 294 case QUESTION: 295 { 296 cout << "("; 297 DumpMidOrder(root->left); 298 cout << ")?"; 299 break; 300 } 301 case CONCAT: 302 { 303 DumpMidOrder(root->left); 304 DumpMidOrder(root->right); 305 break; 306 } 307 case DOT: 308 { 309 cout << "."; 310 break; 311 } 312 case ALT: 313 { 314 DumpMidOrder(root->left); 315 cout << "|"; 316 DumpMidOrder(root->right); 317 break; 318 } 319 default: 320 { 321 cout << "非法的符号: " << root->token << endl; 322 return; 323 } 324 } 325 } 326 327 void Parser::DumpPostOrder(Node* root) 328 { 329 if(root == NULL) return; 330 switch(root->token) 331 { 332 case ALPHA: 333 case NUM: 334 { 335 cout << char(root->subType) ; 336 break; 337 } 338 case PLUS: 339 { 340 DumpPostOrder(root->left); 341 cout << "+"; 342 break; 343 } 344 case STAR: 345 { 346 DumpPostOrder(root->left); 347 cout << "*"; 348 break; 349 } 350 case QUESTION: 351 { 352 DumpPostOrder(root->left); 353 cout << "?"; 354 break; 355 } 356 case CONCAT: 357 { 358 DumpPostOrder(root->left); 359 DumpPostOrder(root->right); 360 cout << "$"; //<By张方雪 2013-5-18>先用这个当连接符吧. 
361 break; 362 } 363 case DOT: 364 { 365 cout << "."; 366 break; 367 } 368 case ALT: 369 { 370 DumpPostOrder(root->left); 371 DumpPostOrder(root->right); 372 cout << "|"; 373 break; 374 } 375 default: 376 { 377 cout << "非法的符号: " << root->token << endl; 378 return; 379 } 380 } 381 } 382 383 void Parser::Dump(Node* root, DumpOrder order) 384 { 385 if(order == MID) 386 { 387 DumpMidOrder(root); 388 } 389 else if(order == POST) 390 { 391 DumpPostOrder(root); 392 } 393 394 cout << endl; 395 } 396 397 398 399 400 401 //test.cpp 402 #include <iostream> 403 #include "Parser.h" 404 405 using namespace std; 406 407 int main() 408 { 409 cout << "Hello world!" << endl; 410 Parser parser; 411 Node* root = parser.Parse("p|a(c*|.f++)+(cd|F|W|C)**"); 412 parser.Dump(root, MID); 413 root = parser.Parse("a|b|c|d+c*"); 414 parser.Dump(root, POST); 415 return 0; 416 }
代码写得乱糟糟, 恐有bug XD.
浙公网安备 33010602011771号