【字符串问题】求一个字符串中重复出现的最长的子串

2013-09-14 15:34:16

用后缀数组求一个字符串中重复出现的最长的子串。

  1. 用C++中的string类可以很方便地进行操作,需将后缀数组保存在vector<string>中,如下面代码中的string版本所示;但这样每个后缀都要复制成一个独立的string对象,长度为n的字符串总共需要O(n²)的空间,开销很大;
  2. 直接用字符指针指向后缀字符串的首地址,可以节省很大的空间,如下面代码中的char *版本所示.
  3. 注意使用char *版本时,用qsort函数对后缀字符串数组排序,需要提供comp函数(qsort传给比较函数的是指向数组元素的指针,即char **,所以要先解引用一次再用strcmp比较),该函数的写法如下:
// qsort comparator for an array of C-string pointers. Each argument is the
// address of an array element (i.e. a char **), so it is dereferenced once
// and the two strings are then ordered with strcmp.
int pStrcmp(const void *p,const void *q)
{
    const char *const lhs = *static_cast<const char *const *>(p);
    const char *const rhs = *static_cast<const char *const *>(q);

    return std::strcmp(lhs, rhs);
}

更多关于该函数的说明,详见博文http://www.cnblogs.com/youngforever/articles/3321469.html

代码(string版本):

  1 #include <iostream>
  2 #include <cassert>
  3 #include <string>        //用string类模板,必须包含该头文件
  4 #include <vector>        //用vector模板,必须包含该头文件
  5 #include <algorithm>   //用sort函数,必须包含该头文件
  6 using namespace std;
  7 
  8 //获取两个字符串的最长公共子串
  9 size_t GetLCS(const string &str1,const string &str2)
 10 {
 11     size_t len1 = str1.length();
 12     size_t len2 = str2.length();
 13 
 14     size_t end  = len1 < len2 ? len1 : len2;
 15 
 16     size_t index = 0;
 17     size_t commenLen = 0;
 18 
 19     for (index = 0;index < end;++index)
 20     {
 21         if (str1.at(index) == str2.at(index))
 22         {
 23             ++commenLen;
 24         }
 25         else
 26         {
 27             break;
 28         }
 29     }
 30 
 31     return commenLen;
 32 }
 33 
 34 //获取字符串中重复出现且最长的子串
 35 size_t FindLongeStringApearTwice(const string &srcStr,string &subStr)
 36 {
 37     vector<string> vecStr;
 38 
 39     size_t index;
 40     size_t end = srcStr.length();
 41 
 42     string tmpStr;
 43     size_t tmpLen = 0;
 44     size_t maxLen = 0;
 45 
 46     for ( index = 0;index < end;++index)   //生成后缀数组
 47     {
 48         tmpStr = srcStr.substr(index,(end - 1 - index));
 49         vecStr.push_back(tmpStr);
 50         //tmpStr.clear();  //此处的清楚是不必要的,可以删去
 51     }
 52 
 53     sort(vecStr.begin(),vecStr.end());  //对后缀字符串数组排序
 54 
 55     vector<string>::iterator iter;
 56 
 57     for (iter = vecStr.begin();(iter + 1) != vecStr.end();++iter)  //求相邻string的LCS
 58     {
 59         tmpLen = GetLCS(*iter,*(iter + 1));
 60 
 61         if(tmpLen > maxLen)
 62         {
 63             maxLen = tmpLen;
 64             subStr = (*iter).substr(0,maxLen);
 65         }
 66     }
 67 
 68     return maxLen;
 69 }
 70 
 71 
 72 
 73 
 74 //测试FindLongeStringApearTwice
 75 void TestDriver()
 76 {
 77     string strArray[] = {"0123456","yyabcdabjcabceg","abcbcbcabc","hello,li mei! hello,li lei!"};
 78     size_t arrayLength = 4;
 79 
 80     string srcStr;
 81     string subStr;
 82     size_t maxLen = 0;
 83 
 84     for (size_t index = 0;index < arrayLength;++index)
 85     {
 86         //srcStr.clear();  //此处的清楚是不必要的,可以删去
 87         srcStr = strArray[index];
 88         maxLen = FindLongeStringApearTwice(srcStr,subStr);
 89 
 90         cout<<"the source string is : "<<srcStr<<endl;
 91         cout<<"the longest sub string is : "<<subStr<<endl;
 92         cout<<"the max length is : "<<maxLen<<endl;
 93         cout<<endl;
 94     }
 95 }
 96 
 97 int main()
 98 {
 99     TestDriver();
100     return 0;
101 }


代码(char *版本):

#include <algorithm>
#include <cassert>
#include <cstdlib>   // qsort
#include <cstring>   // strlen, strcmp
#include <iostream>
#include <string>
#include <vector>
  6 using namespace std;
  7 
  8 size_t GetLCS(const char *str1,const char *str2)
  9 {
 10     assert(str1 != NULL && str2 != NULL);
 11 
 12     size_t len1 = strlen(str1);
 13     size_t len2 = strlen(str2);
 14 
 15     size_t end  = len1 < len2 ? len1 : len2;
 16 
 17     size_t index = 0;
 18     size_t commenLen = 0;
 19 
 20     for (index = 0;index < end;++index)
 21     {
 22         if ( *(str1 + index) == *(str2 + index) )
 23         {
 24             ++commenLen;
 25         }
 26         else
 27         {
 28             break;
 29         }
 30     }
 31 
 32     return commenLen;
 33 }
 34 
 35 typedef char *  pCHAR;
 36 
 37 int pStrcmp(const void *p,const void *q)
 38 {
 39     return ( strcmp( *(char **)p , *(char **)q ) );
 40 }
 41 
 42 void DisplayString(const char *pStr)
 43 {
 44     size_t index = 0;
 45 
 46     while (*(pStr + index))
 47     {
 48         cout<<*(pStr + index);
 49         ++index;
 50     }
 51     cout<<endl;
 52 }
 53 
 54 size_t FindLongestStringApearTwice(const char *pSrcStr,char *&pSubStr)
 55 {
 56     assert(pSrcStr != NULL && pSubStr != NULL);
 57 
 58     const size_t lenOfSrcStr  = strlen(pSrcStr);
 59     pCHAR *pSuffixArray = new pCHAR[lenOfSrcStr];
 60 
 61     size_t index;
 62     size_t end = strlen(pSrcStr);
 63 
 64     size_t tmpLen = 0;
 65     size_t maxLen = 0;
 66 
 67     for ( index = 0;index < end;++index)
 68     {
 69         pSuffixArray[index] = (char *)pSrcStr + index;
 70         //DisplayString(pSuffixArray[index]);
 71     }
 72 
 73     qsort(pSuffixArray,lenOfSrcStr,sizeof(pCHAR),pStrcmp);
 74 /*
 75     for ( index = 0;index < end;++index )
 76     {
 77         DisplayString(pSuffixArray[index]);
 78     }*/
 79 
 80     for (index = 0;index + 1 < end;++index) 
 81     {
 82         tmpLen = GetLCS(pSuffixArray[index],pSuffixArray[index + 1]);
 83 
 84         if(tmpLen > maxLen)
 85         {
 86             maxLen = tmpLen;
 87             pSubStr = pSuffixArray[index];
 88         }
 89     }
 90 
 91     return maxLen;
 92 }
 93 
 94 
 95 void TestDriver()
 96 {
 97     pCHAR strArray[] = {"0123456","yyabcdabjcabceg","abcbcbcabc","hello,li mei! hello,li lei!"};
 98     size_t arrayLength = 4;
 99 
100     pCHAR srcStr;
101     pCHAR subStr;
102     size_t maxLen = 0;
103 
104     for (size_t index = 0;index < arrayLength;++index)
105     {
106         srcStr = strArray[index];
107         maxLen = FindLongestStringApearTwice(srcStr,subStr);
108 
109         cout<<"the source string is : "<<srcStr<<endl;
110         cout<<"the longest sub string is : ";
111 
112         for (size_t i = 0;i < maxLen;++i)
113         {
114             cout<<*(subStr + i);
115         }
116         cout<<endl;
117 
118         cout<<"the max length is : "<<maxLen<<endl;
119         cout<<endl;
120     }
121 }
122 
123 
124 int main()
125 {
126     TestDriver();
127     return 0;
128 }

测试结果:

the source string is : 0123456
the longest sub string is :
the max length is : 0

the source string is : yyabcdabjcabceg
the longest sub string is : abc
the max length is : 3

the source string is : abcbcbcabc
the longest sub string is : bcbc
the max length is : 4

the source string is : hello,li mei! hello,li lei!
the longest sub string is : hello,li
the max length is : 9

请按任意键继续. . .

 

posted @ 2013-09-14 16:48  永不止步,永无止境  阅读(1342)  评论(0)  编辑  收藏  举报