后缀数组应用4: 求不可重叠最长重复子串

View Code
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<iostream>
#include<vector>
#include<string>
#include<math.h>
#include<map>
#include<set>
#include<algorithm>
using namespace std;
#define MAXN 10010

// Shared state, all sized for an input of up to MAXN symbols (text + sentinel).
// sa[i]     : start index of the i-th smallest suffix.
// rank[p]   : position of the suffix starting at p inside sa (inverse of sa).
//             NOTE: with `using namespace std` an unqualified `rank` can clash
//             with std::rank; refer to it as ::rank.
// sum       : counting-sort buckets.
// height[i] : longest common prefix of the suffixes sa[i-1] and sa[i].
int sa[MAXN], rank[MAXN], sum[MAXN], height[MAXN];
// Scratch buffers for the prefix-doubling sort (rank keys and bucket counters).
int wa[MAXN], wb[MAXN], wx[MAXN], wsum[MAXN];
// Input text.
char str[MAXN];
// Sparse table over height[]. It needs one row per height entry, so it has
// MAXN rows like the other buffers (it used to have 1010 and overflowed for
// longer inputs); 20 levels cover any range length below 2^20.
int dp[MAXN][20];
/*
Sparse table for range-minimum queries (RMQ) over height[]:
  dp[i][0] = height[i]
  dp[i][j] = min(dp[i][j-1], dp[i + 2^(j-1)][j-1])
             i.e. the minimum of height[i .. i + 2^j - 1]

Minimum over the closed range [i, j]:
  L = floor(log2(j - i + 1))
  answer = min(dp[i][L], dp[j + 1 - 2^L][L])
*/
// Builds the sparse table from height[0..n].
// Level 0 is copied for indices 0..n; higher levels are filled for start
// indices 1.. only, so queries are expected to start at index >= 1.
void pre(int n)
{
  for( int i = 0; i <= n; i++)
        dp[i][0] = height[i];
  // Window sizes grow by integer shifts rather than (int)log2(n): the result is
  // exact, and n <= 0 (where log2 is -inf/NaN and the cast is undefined) simply
  // builds no further levels.
  for( int j = 1; (1 << j) <= n; j++)
  {
     const int half = 1 << (j - 1);
     for( int i = 1; i + (1 << j) <= n + 1; i++)
        dp[i][j] = std::min(dp[i][j-1], dp[i + half][j-1]);
  }
}

// Returns the minimum of height[a..b] (closed range) from the sparse table dp
// built by pre(). The two bounds may be given in either order.
int get_min( int a, int b)
{
  if( a > b )
  {
     // Normalise the range; the original computed log2 of a non-positive
     // length here, which is undefined.
     const int t = a;
     a = b;
     b = t;
  }
  // Largest L with 2^L <= range length, found with integer shifts so no
  // floating-point rounding can affect the table index.
  const int span = b - a + 1;
  int L = 0;
  while( (1 << (L + 1)) <= span )
     L++;
  // Two windows of length 2^L, anchored at both ends, cover [a, b].
  return std::min(dp[a][L], dp[b + 1 - (1 << L)][L]);
}

// Tells whether positions a and b carry the same (first key, second key) pair:
// r[a] == r[b] and r[a+l] == r[b+l]. Returns 1 when both match, 0 otherwise.
// The second pair is only read when the first pair matches; the caller must
// make sure r[a+l] and r[b+l] are readable in that case.
int cmp( int *r, int a, int b, int l)
{
  if( r[a] != r[b] )
     return 0;
  return r[a + l] == r[b + l] ? 1 : 0;
}

// Builds the suffix array of r[0..n-1] by prefix doubling with counting sorts.
//   r  : the text; bytes are read as unsigned char, so values >= 0x80 are valid.
//   sa : output, receives n entries; sa[i] is the start of the i-th smallest suffix.
//   n  : number of symbols to sort (include the sentinel if the caller uses one).
//   m  : alphabet-size hint (symbols are expected in [0, m)); the bucket count
//        is raised automatically when a larger byte value is present.
// A position past the end of the text compares as smaller than any symbol, so
// a shorter suffix sorts before any longer suffix it is a prefix of; a unique
// terminating sentinel is therefore optional.
// Scratch space is local to the call, so n is not limited by any global buffer.
void get_sa(char *r, int *sa, int n, int m)
{
   if( n <= 0 )
       return;

   std::vector<int> bufA(n), bufB(n);
   int *rk = &bufA[0];   // current rank key of every suffix
   int *aux = &bufB[0];  // suffixes ordered by second key / previous ranks

   // Round 0: the rank key is the (unsigned) byte itself.
   int buckets = m > 0 ? m : 1;
   for( int i = 0; i < n; i++)
   {
       rk[i] = static_cast<unsigned char>(r[i]);
       if( rk[i] >= buckets )
           buckets = rk[i] + 1;
   }
   // Later rounds use up to n distinct ranks, so reserve room for both cases.
   std::vector<int> cnt(buckets > n ? buckets : n, 0);
   for( int i = 0; i < n; i++)
       cnt[rk[i]]++;
   for( int i = 1; i < buckets; i++)
       cnt[i] += cnt[i-1];
   for( int i = n - 1; i >= 0; i--)
       sa[--cnt[rk[i]]] = i;

   // Each round sorts by the first 2*j symbols. After a round the number of
   // buckets becomes the number of distinct ranks p; keeping the initial m
   // here would leave ranks >= m unsorted.
   for( int j = 1, p = 1; p < n && j < n; j *= 2, buckets = p)
   {
      // Order by second key: suffixes with fewer than j symbols left have an
      // empty second key and come first; the rest follow the current sa order.
      p = 0;
      for( int i = n - j; i < n; i++)
          aux[p++] = i;
      for( int i = 0; i < n; i++)
          if( sa[i] >= j )
              aux[p++] = sa[i] - j;

      // Stable counting sort by first key.
      for( int i = 0; i < buckets; i++)
          cnt[i] = 0;
      for( int i = 0; i < n; i++)
          cnt[rk[aux[i]]]++;
      for( int i = 1; i < buckets; i++)
          cnt[i] += cnt[i-1];
      for( int i = n - 1; i >= 0; i--)
          sa[--cnt[rk[aux[i]]]] = aux[i];

      // Re-rank: neighbours in sa share a rank iff both keys are equal.
      int *prev = rk;
      rk = aux;
      aux = prev;
      rk[sa[0]] = 0;
      p = 1;
      for( int i = 1; i < n; i++)
      {
          const int a = sa[i-1];
          const int b = sa[i];
          const int a2 = a + j < n ? prev[a + j] : -1;
          const int b2 = b + j < n ? prev[b + j] : -1;
          const bool same = prev[a] == prev[b] && a2 == b2;
          rk[b] = same ? p - 1 : p++;
      }
   }
}

//h[i] = height[rank[i]], h[i] >= h[i-1] - 1
void get_height(char *r, int n)
{
  int i, j, k = 0;//sa[0] = len 就是我们补的那个0 
  for(i = 1; i <= n; i++)
    rank[sa[i]] = i;
  for(i = 0; i < n ; height[rank[i++]] = k )  
    for( k ? k-- : 0, j = sa[rank[i]-1]; r[i+k] == r[j+k]; k++);
}

// Decides whether some substring of length x occurs at two start positions
// that are at least x apart (i.e. twice without overlapping).
//   x : candidate length
//   n : last index of sa/height to examine (entries 1..n are scanned)
// Returns 1 if such a pair exists, 0 otherwise.
//
// Consecutive suffixes with height >= x form a run in which every suffix
// shares a prefix of length >= x with every other one. The smallest and
// largest start positions must therefore be tracked across the whole run;
// comparing only neighbouring suffixes misses pairs such as positions 0 and 2
// of "aaaa" for x = 2.
int jugde( int x, int n)
{
  int Min = sa[0];
  int Max = sa[0];
  for( int i = 1; i <= n; i++)
  {
     if( height[i] >= x )
     {
        // sa[i] joins the current run.
        Max = std::max(Max, sa[i]);
        Min = std::min(Min, sa[i]);
        if( Max - Min >= x )
            return 1;
     }
     else
     {
        // The run is broken; sa[i] starts a new one.
        Min = sa[i];
        Max = sa[i];
     }
  }
  return 0;
}

// Binary search for the largest length in [l, r] accepted by jugde(length, n),
// i.e. the length of the longest repeated substring whose two occurrences do
// not overlap. Returns 0 when no length in the range is accepted.
int find(int n, int l, int r)
{
  int best = 0;
  for( int lo = l, hi = r; lo <= hi; )
  {
     const int mid = (lo + hi) / 2;
     if( !jugde(mid, n) )
     {
        // mid is too long: continue in the lower half.
        hi = mid - 1;
        continue;
     }
     // mid is feasible: remember it and try a longer length.
     best = mid;
     lo = mid + 1;
  }
  return best;
}
  
int main( )
{
  int a, b, n, m;
  while( scanf("%s",str) != EOF )
  {
    int len = strlen(str);
    str[len] = '0';
    str[len+1] = 0;
    memset(wa,0,sizeof(wa));
    memset(wb,0,sizeof(wb));
    memset(sa,0,sizeof(sa));
    memset(height,0,sizeof(height));
    get_sa(str, sa, len + 1, 255);
    get_height( str, len );
    pre(len);  
    printf("%d\n",find(len, 0, len-1));   
  }
  return 0;
}

求不可重叠最长重复子串:二分答案长度,再利用 height 数组分组判断该长度是否可行。

posted on 2012-09-26 16:16  more think, more gains  阅读(203)  评论(0)  编辑  收藏  举报

导航