在定义后缀树(Suffix Tree)时,我们给出了一段简洁的描述:
A suffix tree is a compressed trie for all the suffixes of a text.
后缀数组(Suffix Array)的定义也同样简洁:
A suffix array is a sorted array of all suffixes of a text.
后缀数组由 Manber & Myers 在 1990 年首先提出《Suffix arrays: a new method for on-line string searches》,用以替代后缀树,并且改进了存储空间的需求。其与后缀树的的关系有:
- 后缀数组可以通过对后缀树做深度优先遍历(DFT: Depth First Traversal)来进行构建,对所有的边(Edge)做字典序(Lexicographical Order)遍历。
- 通过使用后缀集合和最长公共前缀数组(LCP Array: Longest Common Prefix Array)来构建后缀树,可在 O(n) 时间内完成,例如使用 Ukkonen 算法。构造后缀数组同样也可以在 O(n) 时间内完成。
- 每个通过后缀树解决的问题,都可以通过组合使用后缀数组和额外的信息(例如:LCP Array)来解决。
下面用一个例子来解释后缀数组结构。假设文本 Text = "banana",这里增加一个末尾字符 ""以使所有的后缀与前缀不重复,要求"" 为 Text 中不存在并且按字典序排序要小于所有其他字符。(假设数组从 1 开始)
那么对于 Text = "banana$" 的后缀列表如下:
对所有后缀进行字典序(Lexicographical Order)排序:
上表中右侧列即为后缀数组(Suffix Array),SA = [7, 6, 4, 2, 1, 5, 3]。
这样,例如 SA[3] 的值为 4,就代表 Text = "banana$" 中从位置 4 开始的后缀 "ana$"。
在 2009 年,Nong, Ge; Zhang, Sen; Chan, Wai Hong 等发表了论文《Linear Suffix Array Construction by Almost Pure Induced-Sorting》,首次实现了如下 3 个目标:
- minimal asymptotic complexity Θ(n)
- lightweight in space, meaning little or no working memory beside the text and the suffix array itself is needed
- fast in practice
Yuta Mori 对该算法做了具体的实现。
代码示例
1 /*
2 * SAIS.cs for SAIS-CSharp
3 * Copyright (c) 2010 Yuta Mori. All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person
6 * obtaining a copy of this software and associated documentation
7 * files (the "Software"), to deal in the Software without
8 * restriction, including without limitation the rights to use,
9 * copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following
12 * conditions:
13 *
14 * The above copyright notice and this permission notice shall be
15 * included in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24 * OTHER DEALINGS IN THE SOFTWARE.
25 */
26
27 namespace SuffixArray
28 {
29 internal interface BaseArray
30 {
31 int this[int i]
32 {
33 set;
34 get;
35 }
36 }
37
38 internal class ByteArray : BaseArray
39 {
40 private byte[] m_array;
41 private int m_pos;
42 public ByteArray(byte[] array, int pos)
43 {
44 m_pos = pos;
45 m_array = array;
46 }
47 ~ByteArray() { m_array = null; }
48 public int this[int i]
49 {
50 set { m_array[i + m_pos] = (byte)value; }
51 get { return (int)m_array[i + m_pos]; }
52 }
53 }
54
55 internal class CharArray : BaseArray
56 {
57 private char[] m_array;
58 private int m_pos;
59 public CharArray(char[] array, int pos)
60 {
61 m_pos = pos;
62 m_array = array;
63 }
64 ~CharArray() { m_array = null; }
65 public int this[int i]
66 {
67 set { m_array[i + m_pos] = (char)value; }
68 get { return (int)m_array[i + m_pos]; }
69 }
70 }
71
72 internal class IntArray : BaseArray
73 {
74 private int[] m_array;
75 private int m_pos;
76 public IntArray(int[] array, int pos)
77 {
78 m_pos = pos;
79 m_array = array;
80 }
81 ~IntArray() { m_array = null; }
82 public int this[int i]
83 {
84 set { m_array[i + m_pos] = value; }
85 get { return m_array[i + m_pos]; }
86 }
87 }
88
89 internal class StringArray : BaseArray
90 {
91 private string m_array;
92 private int m_pos;
93 public StringArray(string array, int pos)
94 {
95 m_pos = pos;
96 m_array = array;
97 }
98 ~StringArray() { m_array = null; }
99 public int this[int i]
100 {
101 set { }
102 get { return (int)m_array[i + m_pos]; }
103 }
104 }
105
106 /// <summary>
107 /// An implementation of the induced sorting based suffix array construction algorithm.
108 /// </summary>
109 public static class SAIS
110 {
111 private const int MINBUCKETSIZE = 256;
112
113 private static
114 void
115 getCounts(BaseArray T, BaseArray C, int n, int k)
116 {
117 int i;
118 for (i = 0; i < k; ++i) { C[i] = 0; }
119 for (i = 0; i < n; ++i) { C[T[i]] = C[T[i]] + 1; }
120 }
121 private static
122 void
123 getBuckets(BaseArray C, BaseArray B, int k, bool end)
124 {
125 int i, sum = 0;
126 if (end != false) { for (i = 0; i < k; ++i) { sum += C[i]; B[i] = sum; } }
127 else { for (i = 0; i < k; ++i) { sum += C[i]; B[i] = sum - C[i]; } }
128 }
129
130 /* sort all type LMS suffixes */
131 private static
132 void
133 LMSsort(BaseArray T, int[] SA, BaseArray C, BaseArray B, int n, int k)
134 {
135 int b, i, j;
136 int c0, c1;
137 /* compute SAl */
138 if (C == B) { getCounts(T, C, n, k); }
139 getBuckets(C, B, k, false); /* find starts of buckets */
140 j = n - 1;
141 b = B[c1 = T[j]];
142 --j;
143 SA[b++] = (T[j] < c1) ? ~j : j;
144 for (i = 0; i < n; ++i)
145 {
146 if (0 < (j = SA[i]))
147 {
148 if ((c0 = T[j]) != c1) { B[c1] = b; b = B[c1 = c0]; }
149 --j;
150 SA[b++] = (T[j] < c1) ? ~j : j;
151 SA[i] = 0;
152 }
153 else if (j < 0)
154 {
155 SA[i] = ~j;
156 }
157 }
158 /* compute SAs */
159 if (C == B) { getCounts(T, C, n, k); }
160 getBuckets(C, B, k, true); /* find ends of buckets */
161 for (i = n - 1, b = B[c1 = 0]; 0 <= i; --i)
162 {
163 if (0 < (j = SA[i]))
164 {
165 if ((c0 = T[j]) != c1) { B[c1] = b; b = B[c1 = c0]; }
166 --j;
167 SA[--b] = (T[j] > c1) ? ~(j + 1) : j;
168 SA[i] = 0;
169 }
170 }
171 }
172 private static
173 int
174 LMSpostproc(BaseArray T, int[] SA, int n, int m)
175 {
176 int i, j, p, q, plen, qlen, name;
177 int c0, c1;
178 bool diff;
179
180 /* compact all the sorted substrings into the first m items of SA
181 2*m must be not larger than n (proveable) */
182 for (i = 0; (p = SA[i]) < 0; ++i) { SA[i] = ~p; }
183 if (i < m)
184 {
185 for (j = i, ++i; ; ++i)
186 {
187 if ((p = SA[i]) < 0)
188 {
189 SA[j++] = ~p; SA[i] = 0;
190 if (j == m) { break; }
191 }
192 }
193 }
194
195 /* store the length of all substrings */
196 i = n - 1; j = n - 1; c0 = T[n - 1];
197 do { c1 = c0; } while ((0 <= --i) && ((c0 = T[i]) >= c1));
198 for (; 0 <= i; )
199 {
200 do { c1 = c0; } while ((0 <= --i) && ((c0 = T[i]) <= c1));
201 if (0 <= i)
202 {
203 SA[m + ((i + 1) >> 1)] = j - i; j = i + 1;
204 do { c1 = c0; } while ((0 <= --i) && ((c0 = T[i]) >= c1));
205 }
206 }
207
208 /* find the lexicographic names of all substrings */
209 for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i)
210 {
211 p = SA[i]; plen = SA[m + (p >> 1)]; diff = true;
212 if ((plen == qlen) && ((q + plen) < n))
213 {
214 for (j = 0; (j < plen) && (T[p + j] == T[q + j]); ++j) { }
215 if (j == plen) { diff = false; }
216 }
217 if (diff != false) { ++name; q = p; qlen = plen; }
218 SA[m + (p >> 1)] = name;
219 }
220
221 return name;
222 }
223
224 /* compute SA and BWT */
225 private static
226 void
227 induceSA(BaseArray T, int[] SA, BaseArray C, BaseArray B, int n, int k)
228 {
229 int b, i, j;
230 int c0, c1;
231 /* compute SAl */
232 if (C == B) { getCounts(T, C, n, k); }
233 getBuckets(C, B, k, false); /* find starts of buckets */
234 j = n - 1;
235 b = B[c1 = T[j]];
236 SA[b++] = ((0 < j) && (T[j - 1] < c1)) ? ~j : j;
237 for (i = 0; i < n; ++i)
238 {
239 j = SA[i]; SA[i] = ~j;
240 if (0 < j)
241 {
242 if ((c0 = T[--j]) != c1) { B[c1] = b; b = B[c1 = c0]; }
243 SA[b++] = ((0 < j) && (T[j - 1] < c1)) ? ~j : j;
244 }
245 }
246 /* compute SAs */
247 if (C == B) { getCounts(T, C, n, k); }
248 getBuckets(C, B, k, true); /* find ends of buckets */
249 for (i = n - 1, b = B[c1 = 0]; 0 <= i; --i)
250 {
251 if (0 < (j = SA[i]))
252 {
253 if ((c0 = T[--j]) != c1) { B[c1] = b; b = B[c1 = c0]; }
254 SA[--b] = ((j == 0) || (T[j - 1] > c1)) ? ~j : j;
255 }
256 else
257 {
258 SA[i] = ~j;
259 }
260 }
261 }
262 private static
263 int
264 computeBWT(BaseArray T, int[] SA, BaseArray C, BaseArray B, int n, int k)
265 {
266 int b, i, j, pidx = -1;
267 int c0, c1;
268 /* compute SAl */
269 if (C == B) { getCounts(T, C, n, k); }
270 getBuckets(C, B, k, false); /* find starts of buckets */
271 j = n - 1;
272 b = B[c1 = T[j]];
273 SA[b++] = ((0 < j) && (T[j - 1] < c1)) ? ~j : j;
274 for (i = 0; i < n; ++i)
275 {
276 if (0 < (j = SA[i]))
277 {
278 SA[i] = ~(c0 = T[--j]);
279 if (c0 != c1) { B[c1] = b; b = B[c1 = c0]; }
280 SA[b++] = ((0 < j) && (T[j - 1] < c1)) ? ~j : j;
281 }
282 else if (j != 0)
283 {
284 SA[i] = ~j;
285 }
286 }
287 /* compute SAs */
288 if (C == B) { getCounts(T, C, n, k); }
289 getBuckets(C, B, k, true); /* find ends of buckets */
290 for (i = n - 1, b = B[c1 = 0]; 0 <= i; --i)
291 {
292 if (0 < (j = SA[i]))
293 {
294 SA[i] = (c0 = T[--j]);
295 if (c0 != c1) { B[c1] = b; b = B[c1 = c0]; }
296 SA[--b] = ((0 < j) && (T[j - 1] > c1)) ? ~((int)T[j - 1]) : j;
297 }
298 else if (j != 0)
299 {
300 SA[i] = ~j;
301 }
302 else
303 {
304 pidx = i;
305 }
306 }
307 return pidx;
308 }
309
310 /* find the suffix array SA of T[0..n-1] in {0..k-1}^n
311 use a working space (excluding T and SA) of at most 2n+O(1) for a constant alphabet */
312 private static
313 int
314 sais_main(BaseArray T, int[] SA, int fs, int n, int k, bool isbwt)
315 {
316 BaseArray C, B, RA;
317 int i, j, b, m, p, q, name, pidx = 0, newfs;
318 int c0, c1;
319 uint flags = 0;
320
321 if (k <= MINBUCKETSIZE)
322 {
323 C = new IntArray(new int[k], 0);
324 if (k <= fs) { B = new IntArray(SA, n + fs - k); flags = 1; }
325 else { B = new IntArray(new int[k], 0); flags = 3; }
326 }
327 else if (k <= fs)
328 {
329 C = new IntArray(SA, n + fs - k);
330 if (k <= (fs - k)) { B = new IntArray(SA, n + fs - k * 2); flags = 0; }
331 else if (k <= (MINBUCKETSIZE * 4)) { B = new IntArray(new int[k], 0); flags = 2; }
332 else { B = C; flags = 8; }
333 }
334 else
335 {
336 C = B = new IntArray(new int[k], 0);
337 flags = 4 | 8;
338 }
339
340 /* stage 1: reduce the problem by at least 1/2
341 sort all the LMS-substrings */
342 getCounts(T, C, n, k); getBuckets(C, B, k, true); /* find ends of buckets */
343 for (i = 0; i < n; ++i) { SA[i] = 0; }
344 b = -1; i = n - 1; j = n; m = 0; c0 = T[n - 1];
345 do { c1 = c0; } while ((0 <= --i) && ((c0 = T[i]) >= c1));
346 for (; 0 <= i; )
347 {
348 do { c1 = c0; } while ((0 <= --i) && ((c0 = T[i]) <= c1));
349 if (0 <= i)
350 {
351 if (0 <= b) { SA[b] = j; } b = --B[c1]; j = i; ++m;
352 do { c1 = c0; } while ((0 <= --i) && ((c0 = T[i]) >= c1));
353 }
354 }
355 if (1 < m)
356 {
357 LMSsort(T, SA, C, B, n, k);
358 name = LMSpostproc(T, SA, n, m);
359 }
360 else if (m == 1)
361 {
362 SA[b] = j + 1;
363 name = 1;
364 }
365 else
366 {
367 name = 0;
368 }
369
370 /* stage 2: solve the reduced problem
371 recurse if names are not yet unique */
372 if (name < m)
373 {
374 if ((flags & 4) != 0) { C = null; B = null; }
375 if ((flags & 2) != 0) { B = null; }
376 newfs = (n + fs) - (m * 2);
377 if ((flags & (1 | 4 | 8)) == 0)
378 {
379 if ((k + name) <= newfs) { newfs -= k; }
380 else { flags |= 8; }
381 }
382 for (i = m + (n >> 1) - 1, j = m * 2 + newfs - 1; m <= i; --i)
383 {
384 if (SA[i] != 0) { SA[j--] = SA[i] - 1; }
385 }
386 RA = new IntArray(SA, m + newfs);
387 sais_main(RA, SA, newfs, m, name, false);
388 RA = null;
389
390 i = n - 1; j = m * 2 - 1; c0 = T[n - 1];
391 do { c1 = c0; } while ((0 <= --i) && ((c0 = T[i]) >= c1));
392 for (; 0 <= i; )
393 {
394 do { c1 = c0; } while ((0 <= --i) && ((c0 = T[i]) <= c1));
395 if (0 <= i)
396 {
397 SA[j--] = i + 1;
398 do { c1 = c0; } while ((0 <= --i) && ((c0 = T[i]) >= c1));
399 }
400 }
401
402 for (i = 0; i < m; ++i) { SA[i] = SA[m + SA[i]]; }
403 if ((flags & 4) != 0) { C = B = new IntArray(new int[k], 0); }
404 if ((flags & 2) != 0) { B = new IntArray(new int[k], 0); }
405 }
406
407 /* stage 3: induce the result for the original problem */
408 if ((flags & 8) != 0) { getCounts(T, C, n, k); }
409 /* put all left-most S characters into their buckets */
410 if (1 < m)
411 {
412 getBuckets(C, B, k, true); /* find ends of buckets */
413 i = m - 1; j = n; p = SA[m - 1]; c1 = T[p];
414 do
415 {
416 q = B[c0 = c1];
417 while (q < j) { SA[--j] = 0; }
418 do
419 {
420 SA[--j] = p;
421 if (--i < 0) { break; }
422 p = SA[i];
423 } while ((c1 = T[p]) == c0);
424 } while (0 <= i);
425 while (0 < j) { SA[--j] = 0; }
426 }
427 if (isbwt == false) { induceSA(T, SA, C, B, n, k); }
428 else { pidx = computeBWT(T, SA, C, B, n, k); }
429 C = null; B = null;
430 return pidx;
431 }
432
433 /*- Suffixsorting -*/
434 /* byte */
435 /// <summary>
436 /// Constructs the suffix array of a given string in linear time.
437 /// </summary>
438 /// <param name="T">input string</param>
439 /// <param name="SA">output suffix array</param>
440 /// <param name="n">length of the given string</param>
441 /// <returns>0 if no error occurred, -1 or -2 otherwise</returns>
442 public static
443 int
444 sufsort(byte[] T, int[] SA, int n)
445 {
446 if ((T == null) || (SA == null) ||
447 (T.Length < n) || (SA.Length < n)) { return -1; }
448 if (n <= 1) { if (n == 1) { SA[0] = 0; } return 0; }
449 return sais_main(new ByteArray(T, 0), SA, 0, n, 256, false);
450 }
451 /* char */
452 /// <summary>
453 /// Constructs the suffix array of a given string in linear time.
454 /// </summary>
455 /// <param name="T">input string</param>
456 /// <param name="SA">output suffix array</param>
457 /// <param name="n">length of the given string</param>
458 /// <returns>0 if no error occurred, -1 or -2 otherwise</returns>
459 public static
460 int
461 sufsort(char[] T, int[] SA, int n)
462 {
463 if ((T == null) || (SA == null) ||
464 (T.Length < n) || (SA.Length < n)) { return -1; }
465 if (n <= 1) { if (n == 1) { SA[0] = 0; } return 0; }
466 return sais_main(new CharArray(T, 0), SA, 0, n, 65536, false);
467 }
468 /* int */
469 /// <summary>
470 /// Constructs the suffix array of a given string in linear time.
471 /// </summary>
472 /// <param name="T">input string</param>
473 /// <param name="SA">output suffix array</param>
474 /// <param name="n">length of the given string</param>
475 /// <param name="k">alphabet size</param>
476 /// <returns>0 if no error occurred, -1 or -2 otherwise</returns>
477 public static
478 int
479 sufsort(int[] T, int[] SA, int n, int k)
480 {
481 if ((T == null) || (SA == null) ||
482 (T.Length < n) || (SA.Length < n) ||
483 (k <= 0)) { return -1; }
484 if (n <= 1) { if (n == 1) { SA[0] = 0; } return 0; }
485 return sais_main(new IntArray(T, 0), SA, 0, n, k, false);
486 }
487 /* string */
488 /// <summary>
489 /// Constructs the suffix array of a given string in linear time.
490 /// </summary>
491 /// <param name="T">input string</param>
492 /// <param name="SA">output suffix array</param>
493 /// <param name="n">length of the given string</param>
494 /// <returns>0 if no error occurred, -1 or -2 otherwise</returns>
495 public static
496 int
497 sufsort(string T, int[] SA, int n)
498 {
499 if ((T == null) || (SA == null) ||
500 (T.Length < n) || (SA.Length < n)) { return -1; }
501 if (n <= 1) { if (n == 1) { SA[0] = 0; } return 0; }
502 return sais_main(new StringArray(T, 0), SA, 0, n, 65536, false);
503 }
504
505 /*- Burrows-Wheeler Transform -*/
506 /* byte */
507 /// <summary>
508 /// Constructs the burrows-wheeler transformed string of a given string in linear time.
509 /// </summary>
510 /// <param name="T">input string</param>
511 /// <param name="U">output string</param>
512 /// <param name="A">temporary array</param>
513 /// <param name="n">length of the given string</param>
514 /// <returns>primary index if no error occurred, -1 or -2 otherwise</returns>
515 public static
516 int
517 bwt(byte[] T, byte[] U, int[] A, int n)
518 {
519 int i, pidx;
520 if ((T == null) || (U == null) || (A == null) ||
521 (T.Length < n) || (U.Length < n) || (A.Length < n)) { return -1; }
522 if (n <= 1) { if (n == 1) { U[0] = T[0]; } return n; }
523 pidx = sais_main(new ByteArray(T, 0), A, 0, n, 256, true);
524 U[0] = T[n - 1];
525 for (i = 0; i < pidx; ++i) { U[i + 1] = (byte)A[i]; }
526 for (i += 1; i < n; ++i) { U[i] = (byte)A[i]; }
527 return pidx + 1;
528 }
529 /* char */
530 /// <summary>
531 /// Constructs the burrows-wheeler transformed string of a given string in linear time.
532 /// </summary>
533 /// <param name="T">input string</param>
534 /// <param name="U">output string</param>
535 /// <param name="A">temporary array</param>
536 /// <param name="n">length of the given string</param>
537 /// <returns>primary index if no error occurred, -1 or -2 otherwise</returns>
538 public static
539 int
540 bwt(char[] T, char[] U, int[] A, int n)
541 {
542 int i, pidx;
543 if ((T == null) || (U == null) || (A == null) ||
544 (T.Length < n) || (U.Length < n) || (A.Length < n)) { return -1; }
545 if (n <= 1) { if (n == 1) { U[0] = T[0]; } return n; }
546 pidx = sais_main(new CharArray(T, 0), A, 0, n, 65536, true);
547 U[0] = T[n - 1];
548 for (i = 0; i < pidx; ++i) { U[i + 1] = (char)A[i]; }
549 for (i += 1; i < n; ++i) { U[i] = (char)A[i]; }
550 return pidx + 1;
551 }
552 /* int */
553 /// <summary>
554 /// Constructs the burrows-wheeler transformed string of a given string in linear time.
555 /// </summary>
556 /// <param name="T">input string</param>
557 /// <param name="U">output string</param>
558 /// <param name="A">temporary array</param>
559 /// <param name="n">length of the given string</param>
560 /// <param name="k">alphabet size</param>
561 /// <returns>primary index if no error occurred, -1 or -2 otherwise</returns>
562 public static
563 int
564 bwt(int[] T, int[] U, int[] A, int n, int k)
565 {
566 int i, pidx;
567 if ((T == null) || (U == null) || (A == null) ||
568 (T.Length < n) || (U.Length < n) || (A.Length < n) ||
569 (k <= 0)) { return -1; }
570 if (n <= 1) { if (n == 1) { U[0] = T[0]; } return n; }
571 pidx = sais_main(new IntArray(T, 0), A, 0, n, k, true);
572 U[0] = T[n - 1];
573 for (i = 0; i < pidx; ++i) { U[i + 1] = A[i]; }
574 for (i += 1; i < n; ++i) { U[i] = A[i]; }
575 return pidx + 1;
576 }
577 }
578 }
测试代码
1 /*
2 * suftest.cs for SAIS-CSharp
3 * Copyright (c) 2010 Yuta Mori. All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person
6 * obtaining a copy of this software and associated documentation
7 * files (the "Software"), to deal in the Software without
8 * restriction, including without limitation the rights to use,
9 * copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following
12 * conditions:
13 *
14 * The above copyright notice and this permission notice shall be
15 * included in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24 * OTHER DEALINGS IN THE SOFTWARE.
25 */
26
27 using SuffixArray;
28
29 namespace suftest
30 {
31 class MainClass
32 {
33 private static
34 int
35 check(byte[] T, int[] SA, int n, bool verbose)
36 {
37 int[] C = new int[256];
38 int i, p, q, t;
39 int c;
40
41 if (verbose) { System.Console.Write(@"sufcheck: "); }
42 if (n == 0)
43 {
44 if (verbose) { System.Console.WriteLine("Done."); }
45 return 0;
46 }
47
48 /* Check arguments. */
49 if ((T == null) || (SA == null) || (n < 0))
50 {
51 if (verbose) { System.Console.WriteLine("Invalid arguments."); }
52 return -1;
53 }
54
55 /* check range: [0..n-1] */
56 for (i = 0; i < n; ++i)
57 {
58 if ((SA[i] < 0) || (n <= SA[i]))
59 {
60 if (verbose)
61 {
62 System.Console.WriteLine("Out of the range [0," + (n - 1) + "].");
63 System.Console.WriteLine(" SA[" + i + "]=" + SA[i]);
64 }
65 return -2;
66 }
67 }
68
69 /* check first characters. */
70 for (i = 1; i < n; ++i)
71 {
72 if (T[SA[i - 1]] > T[SA[i]])
73 {
74 if (verbose)
75 {
76 System.Console.WriteLine("Suffixes in wrong order.");
77 System.Console.Write(" T[SA[" + (i - 1) + "]=" + SA[i - 1] + "]=" + T[SA[i - 1]]);
78 System.Console.WriteLine(" > T[SA[" + i + "]=" + SA[i] + "]=" + T[SA[i]]);
79 }
80 return -3;
81 }
82 }
83
84 /* check suffixes. */
85 for (i = 0; i < 256; ++i) { C[i] = 0; }
86 for (i = 0; i < n; ++i) { ++C[T[i]]; }
87 for (i = 0, p = 0; i < 256; ++i)
88 {
89 t = C[i];
90 C[i] = p;
91 p += t;
92 }
93
94 q = C[T[n - 1]];
95 C[T[n - 1]] += 1;
96 for (i = 0; i < n; ++i)
97 {
98 p = SA[i];
99 if (0 < p)
100 {
101 c = T[--p];
102 t = C[c];
103 }
104 else
105 {
106 c = T[p = n - 1];
107 t = q;
108 }
109 if ((t < 0) || (p != SA[t]))
110 {
111 if (verbose)
112 {
113 System.Console.WriteLine("Suffixes in wrong position.");
114 System.Console.WriteLine(" SA[" + t + "]=" + ((0 <= t) ? SA[t] : -1) + " or");
115 System.Console.WriteLine(" SA[" + i + "]=" + SA[i]);
116 }
117 return -4;
118 }
119 if (t != q)
120 {
121 ++C[c];
122 if ((n <= C[c]) || (T[SA[C[c]]] != c)) { C[c] = -1; }
123 }
124 }
125 C = null;
126
127 if (verbose) { System.Console.WriteLine("Done."); }
128 return 0;
129 }
130
131 public static
132 void
133 Main(string[] args)
134 {
135 int i;
136 for (i = 0; i < args.Length; ++i)
137 {
138 using (System.IO.FileStream fs = new System.IO.FileStream(
139 args[i],
140 System.IO.FileMode.Open,
141 System.IO.FileAccess.Read))
142 {
143 System.Console.Write(args[i] + @": {0:d} bytes ... ", fs.Length);
144 byte[] T = new byte[fs.Length];
145 int[] SA = new int[fs.Length];
146 fs.Read(T, 0, T.Length);
147
148 System.Diagnostics.Stopwatch sw = new System.Diagnostics.Stopwatch();
149 sw.Start();
150 SAIS.sufsort(T, SA, T.Length);
151 sw.Stop();
152
153 double sec = (double)sw.ElapsedTicks / (double)System.Diagnostics.Stopwatch.Frequency;
154 System.Console.WriteLine(@"{0:f4} sec", sec);
155
156 check(T, SA, T.Length, true);
157 T = null;
158 SA = null;
159 }
160 }
161 }
162 }
163 }
后缀数组的应用
- 应用1:最长公共前缀
- 应用2:最长重复子串(不重叠)
- 应用3:最长重复子串(可重叠)
- 应用4:至少重复 k 次的最长子串(可重叠)
- 应用5:最长回文子串
- 应用6:最长公共子串
- 应用7:长度不小于 k 的公共子串的个数
- 应用8:至少出现在 k 个串中的最长子串
参考资料
- Suffix array
- Suffix Array Algorithm
- Trie vs. suffix tree vs. suffix array
- Replacing suffix trees with enhanced suffix arrays
- A Taxonomy of Suffix Array Construction Algorithms
- Suffix Array | Set 1 (Introduction)
- Linear Work Suffix Array Construction
- Simple Linear Work Suffix Array Construction
- Suffix arrays – a programming contest approach
- A suffix array implementation in C#
- An implementation of the induced sorting algorithm
- An Optimal Suffix Array Construction Algorithm
- Linear Suffix Array Construction by Almost Pure Induced-Sorting
- 后缀数组及应用
- 后缀数组&后缀树(Suffix Array & Suffix Tree)
- 求多个字符串的最大公共子串
- 后缀数组求最长重复子串
- Prof. Dr. Johannes Fischer Inducing Suffix and LCP Arrays in External Memory
- What's the current state-of-the-art suffix array construction algorithm?
- libdivsufsort : SACA_Benchmarks
- Suffix Array Construction Benchmark

浙公网安备 33010602011771号