C语言使用正则表达式,可以利用pcre库,这个比较不错的哦。
在使用过程中,我先用Python验证了正则表达式,结果是OK的;但把同一个表达式写进C代码之后,编译时出现了问题。如下所示:
regex.c:11:18: warning: unknown escape sequence: '\/' [enabled by default]
char* url_re="(https?|ftp|mms):\/\/([A-z0-9]+[_\-]?[A-z0-9]?\.)*[A-z0-9]+\-?[A-z0-9]+\.[A-z]{2,}(\/.*)?";
^
regex.c:11:18: warning: unknown escape sequence: '\/' [enabled by default]
regex.c:11:18: warning: unknown escape sequence: '\-' [enabled by default]
regex.c:11:18: warning: unknown escape sequence: '\.' [enabled by default]
regex.c:11:18: warning: unknown escape sequence: '\-' [enabled by default]
regex.c:11:18: warning: unknown escape sequence: '\.' [enabled by default]
regex.c:11:18: warning: unknown escape sequence: '\/' [enabled by default]
这到底怎么回事呢?利用Python执行结果是OK的呀。代码如下:
#!/usr/bin/env python
#
"""Extract the first URL-like substring from every line of a text file.

Usage: script.py [input-file]
The input file defaults to 'url.info'; matches are written to 'tmp.txt'.
"""
import re
import sys
import os

# Earlier variant of the pattern that also tried to match a trailing path:
#restr="(https?|ftp|mms):\/\/([A-z0-9]+[_\-]?[A-z0-9]+\.)*[A-z0-9]+\-?[A-z0-9]+\.[A-z]{2,}(\/.*)*\/?"

# Raw string so the backslashes reach the regex engine untouched; in a normal
# string literal "\/", "\-" and "\." are invalid escape sequences that newer
# Python versions warn about. The pattern text itself is unchanged.
# Compiled once at import time instead of on every geturl() call.
# NOTE(review): the range [A-z] also covers the ASCII characters between 'Z'
# and 'a' ('[', '\\', ']', '^', '_', '`'); [A-Za-z] may be what was intended.
_URL_PATTERN = re.compile(
    r"(https?|ftp|mms):\/\/([A-z0-9]+[_\-]?[A-z0-9]?\.)*[A-z0-9]+\-?[A-z0-9]+\.[A-z]{2,}"
)


def geturl(url=''):
    """Return the first substring of `url` matching the URL pattern.

    Returns None when nothing matches.
    """
    match = _URL_PATTERN.search(url)
    if match:
        return match.group()
    return None


################# GetLine ############################
def dealUrl(fmtfile):
    """Scan `fmtfile` line by line and write each line's first URL to tmp.txt.

    Every match is also printed together with a running counter.
    """
    i = 0
    # 'with' guarantees both files are closed (and tmp.txt flushed) even if
    # an exception interrupts the loop.
    with open(fmtfile, 'r') as src, open("tmp.txt", 'w') as fo:
        for line in src:
            newline = geturl(line)
            if newline is not None:
                print(i, newline)
                fo.write(newline + '\n')
                # NOTE(review): the original paste lost its indentation; the
                # counter is assumed to count matches, not input lines.
                i += 1


################# Main ##############################
if __name__ == '__main__':
    if len(sys.argv) < 2:
        filename = 'url.info'
    else:
        filename = sys.argv[1]
    dealUrl(filename)
查询后发现,原因在于C语言的字符串字面量本身就把反斜杠当作转义字符:编译器会先处理"\/"、"\-"、"\."这样的序列,而它们都不是合法的C转义序列,所以产生了告警。要把一个反斜杠原样传给正则引擎,在C源码里就需要写成两个反斜杠,也即:"\/"需要写成"\\/"。对url_re做了调整后,再次测试发现编译告警消失,执行结果也是OK啦。
1 int filter(char* str,char* url) 2 { 3 pcre *re; 4 const char* error; 5 int erroffset; 6 int ovector[RE_OVERCOUNT]; 7 int rc; 8 char* url_re="(https?|ftp|mms):\\/\\/([A-z0-9]+[_\\-]?[A-z0-9]?\\.)*[A-z0-9]+\\-?[A-z0-9]+\\.[A-z]{2,}"; 9 10 if(str==NULL || url==NULL) return 0; 11 printf("str: %s\n", str); 12 re = pcre_compile(url_re, 0, &error, &erroffset, NULL); 13 if(re == NULL){ 14 printf("PCRE pcre_compile failed at offset %d: %s\n", erroffset, error); 15 return 0; 16 } 17 char *p=str; 18 if((rc=pcre_exec(re,NULL,p,strlen(p),0,0,ovector,RE_OVERCOUNT))!=PCRE_ERROR_NOMATCH){ 19 char* url_start = p + ovector[0]; 20 int urllen = ovector[1] - ovector[0]; 21 strncpy(url, url_start, urllen); 22 printf("urllen %d, url:%s\n", urllen, url); 23 return urllen; 24 } 25 pcre_free(re); 26 return 0; 27 }