# 字符串匹配算法 - KMP

### KMP匹配算法

    /// <summary>
    /// Reports whether <paramref name="b"/> occurs as a contiguous substring of
    /// <paramref name="a"/>, sliding a comparison window over <paramref name="a"/>
    /// and using a partial-match table of <paramref name="b"/> to decide how far
    /// the window may jump after a mismatch.
    /// </summary>
    /// <param name="a">The text to search in.</param>
    /// <param name="b">The pattern to search for.</param>
    /// <returns>
    /// <c>true</c> if <paramref name="b"/> is found in <paramref name="a"/>;
    /// <c>false</c> if it is not found, if either argument is null or empty,
    /// or if <paramref name="b"/> is longer than <paramref name="a"/>.
    /// </returns>
    public static bool IsSubString(string a, string b)
    {
        if (string.IsNullOrEmpty(a) || string.IsNullOrEmpty(b))
        {
            return false;
        }

        if (b.Length > a.Length)
        {
            return false;
        }

        // overlap[k] = length of the longest proper prefix of b[0..k] that is also its suffix.
        int[] overlap = new int[b.Length];
        for (int prefixLength = 1; prefixLength <= b.Length; prefixLength++)
        {
            overlap[prefixLength - 1] = GetPartialMatchCount(b.Substring(0, prefixLength));
        }

        int lastWindowStart = a.Length - b.Length;
        int windowStart = 0;   // index in a where the current alignment of b begins
        int matched = 0;       // number of leading chars of b already known to match

        while (windowStart <= lastWindowStart)
        {
            // Extend the match as far as it goes from the current alignment.
            while (matched < b.Length && a[windowStart + matched] == b[matched])
            {
                matched++;
            }

            if (matched == b.Length)
            {
                return true;
            }

            if (matched == 0)
            {
                // Nothing matched at this alignment: just slide the window by one.
                windowStart++;
                continue;
            }

            // Example: a = "ABCDAB ABCDABCDABDE", b = "ABCDABD".
            // At windowStart 0 the six chars "ABCDAB" match and their longest
            // prefix/suffix overlap is "AB" (length 2). The window can therefore
            // jump forward by 6 - 2 = 4, and the two overlapping chars are already
            // known to match, so comparison resumes at b[2].
            int keep = overlap[matched - 1];
            windowStart += matched - keep;
            matched = keep;
        }

        return false;
    }

/// <summary>
/// Computes the length of the longest proper prefix of <paramref name="str"/>
/// that is also a suffix of it (the whole string itself is not considered).
/// </summary>
/// <param name="str">The string to inspect.</param>
/// <returns>The shared prefix/suffix length, or 0 when there is none.</returns>
public static int GetPartialMatchCount(string str)
{
    // Probe candidate lengths from longest to shortest; the first hit is the maximum.
    for (int length = str.Length - 1; length >= 1; length--)
    {
        int suffixStart = str.Length - length;

        // Ordinal comparison of the leading and trailing 'length' characters.
        if (string.CompareOrdinal(str, 0, str, suffixStart, length) == 0)
        {
            return length;
        }
    }

    return 0;
}


    /// <summary>
    /// Builds the KMP table for <paramref name="str"/>.
    /// </summary>
    /// <remarks>
    /// Each entry can be read in two ways:
    /// 1. <c>next[i]</c> is the number of characters shared by the longest proper
    ///    prefix and suffix of <c>str[0..i]</c>.
    /// 2. Because indices start at 0, the same number is also the index in
    ///    <paramref name="str"/> at which matching should continue when position
    ///    <c>i</c> was the last one that matched.
    /// </remarks>
    /// <param name="str">The pattern string; must not be null.</param>
    /// <returns>
    /// An array with one entry per character of <paramref name="str"/>; an empty
    /// array when <paramref name="str"/> is empty.
    /// </returns>
    public static int[] BuildJumpTable(string str)
    {
        // Array elements start out as 0, so next[0] (a single character has no
        // proper prefix/suffix overlap) needs no explicit store. Leaving it out
        // also lets an empty string produce an empty table instead of indexing
        // past the end of a zero-length array.
        int[] next = new int[str.Length];

        // i only ever moves forward; j is the length of the overlap found so far,
        // which is also the index of the next prefix character to compare.
        for (int i = 1, j = 0; i < str.Length; i++)
        {
            // On a mismatch, fall back through the entries already computed:
            // j > 0 means str[j - 1] matched, and next[j - 1] is the next shorter
            // overlap worth trying.
            while (j > 0 && str[i] != str[j])
            {
                j = next[j - 1];
            }

            if (str[i] == str[j])
            {
                j++;
            }

            next[i] = j;
        }

        return next;
    }


• 这里的局部匹配表其实有两层含义，必须明白这两层含义才能够理解这段算法。一个是阮一峰博客中所指的前缀和后缀最长公共部分的长度，我姑且叫做匹配表；而另一个含义则利用了数组下标从0开始的这个特点，表达的意思是：当某个位置是最后一个被匹配的字符时，下一次匹配应该从哪个位置开始，姑且叫做跳跃表。
• j = next[j - 1] 这个地方就是当做跳跃表来使用，而next[i] = j则是表示匹配表，我觉得只有区分开来才能够更好的理解。
• i是一直增长的数，表示已经匹配到哪个位置。而j既表示下一个要匹配的位置，又表示已经匹配了多少个字符；正是因为数组下标从0开始，这两个值才会相等。
• 比较巧妙的一块代码就是while循环那里，这段代码其实在计算jump table的时候就已经用到了一些kmp算法的思想。如果当前j和i位置上的字符不匹配，并且j比0大，就说明j - 1那个位置一定是匹配的，而跳跃表next[j - 1]里面存着的恰恰就是下一个要尝试匹配的位置。就这样一直回溯，直到找到相等的那个字符，或者回到第0个位置。

    /// <summary>
    /// Reports whether <paramref name="b"/> occurs as a contiguous substring of
    /// <paramref name="a"/>, scanning <paramref name="a"/> once from left to right
    /// and using the table from <c>BuildJumpTable</c> to fall back inside
    /// <paramref name="b"/> on a mismatch.
    /// </summary>
    /// <param name="a">The text to search in.</param>
    /// <param name="b">The pattern to search for.</param>
    /// <returns>
    /// <c>true</c> if <paramref name="b"/> is found in <paramref name="a"/>;
    /// <c>false</c> if it is not found, if either argument is null or empty,
    /// or if <paramref name="b"/> is longer than <paramref name="a"/>.
    /// </returns>
    public static bool IsSubString(string a, string b)
    {
        if (string.IsNullOrEmpty(a) || string.IsNullOrEmpty(b))
        {
            return false;
        }

        if (a.Length < b.Length)
        {
            return false;
        }

        int[] jumpTable = BuildJumpTable(b);

        // Number of leading chars of b currently matched; also the index of the
        // next char of b to compare.
        int matched = 0;

        foreach (char current in a)
        {
            // matched > 0 means b[matched - 1] was matched, so the table entry for
            // that position gives the next shorter alignment to try.
            while (matched > 0 && current != b[matched])
            {
                matched = jumpTable[matched - 1];
            }

            // On a match, advance within b.
            if (current == b[matched])
            {
                matched++;
            }

            if (matched == b.Length)
            {
                return true;
            }
        }

        return false;
    }


posted @ 2014-07-17 07:48  imjustice  阅读(1913)  评论(2)  编辑  收藏  举报