/* Aho-Corasick text search algorithm implementation
*
* For more information visit
* - http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf
*/
using System;
using System.Collections;
namespace EeekSoft.Text
{
/// <summary>
/// Contract for a keyword search engine: it holds a list of keywords and
/// answers "where" and "whether" queries about them for a given text.
/// </summary>
public interface IStringSearchAlgorithm
{
    #region Methods & Properties

    /// <summary>
    /// Gets or sets the keywords the search methods look for.
    /// </summary>
    string[] Keywords { get; set; }

    /// <summary>
    /// Scans the given text and reports every keyword occurrence in it.
    /// </summary>
    /// <param name="text">Text to scan</param>
    /// <returns>One result per occurrence found</returns>
    StringSearchResult[] FindAll(string text);

    /// <summary>
    /// Scans the given text and reports the first keyword occurrence.
    /// </summary>
    /// <param name="text">Text to scan</param>
    /// <returns>The first occurrence, or StringSearchResult.Empty when the text contains no keyword</returns>
    StringSearchResult FindFirst(string text);

    /// <summary>
    /// Tells whether the given text contains at least one keyword.
    /// </summary>
    /// <param name="text">Text to scan</param>
    /// <returns>True when at least one keyword occurs in the text</returns>
    bool ContainsAny(string text);

    #endregion
}
/// <summary>
/// A single search hit: the keyword that was found together with the
/// position at which it starts in the searched text.
/// </summary>
public struct StringSearchResult
{
    #region Members

    // Start offset of the hit (-1 in the Empty value)
    private readonly int _position;

    // Keyword that was matched ("" in the Empty value)
    private readonly string _match;

    /// <summary>
    /// Creates a result for the given position and keyword
    /// </summary>
    /// <param name="index">Position of the keyword in the text</param>
    /// <param name="keyword">Keyword that was found</param>
    public StringSearchResult(int index, string keyword)
    {
        _position = index;
        _match = keyword;
    }

    /// <summary>
    /// Position in the original text at which the keyword was found
    /// </summary>
    public int Index
    {
        get
        {
            return _position;
        }
    }

    /// <summary>
    /// Keyword this result refers to
    /// </summary>
    public string Keyword
    {
        get
        {
            return _match;
        }
    }

    /// <summary>
    /// The "nothing found" value: index -1 and an empty keyword
    /// </summary>
    public static StringSearchResult Empty
    {
        get
        {
            StringSearchResult none = new StringSearchResult(-1, "");
            return none;
        }
    }

    #endregion
}
/// <summary>
/// Searches a text for any of several keywords in a single pass using the
/// Aho-Corasick algorithm (a keyword trie whose nodes carry failure links).
/// </summary>
/// <remarks>
/// Null and empty entries of the keyword list are ignored. An instance
/// without keywords (default constructor, or a null keyword array) is
/// valid and simply finds nothing.
/// </remarks>
public class StringSearch : IStringSearchAlgorithm
{
    #region Objects

    /// <summary>
    /// One state of the keyword trie. A node knows the character on the edge
    /// leading to it, its parent, its children, its failure link and the
    /// keywords that end when this state is reached.
    /// </summary>
    class TreeNode
    {
        #region Constructor & Methods

        /// <summary>
        /// Creates a node reached from <paramref name="parent"/> via character <paramref name="c"/>
        /// </summary>
        /// <param name="parent">Parent node (null for the root)</param>
        /// <param name="c">Character on the edge leading to this node</param>
        public TreeNode(TreeNode parent, char c)
        {
            _char = c;
            _parent = parent;
            _results = new ArrayList();
            _resultsAr = new string[] {};
            _transitionsAr = new TreeNode[] {};
            _transHash = new Hashtable();
        }

        /// <summary>
        /// Registers a keyword that ends in this node; duplicates are ignored
        /// </summary>
        /// <param name="result">Keyword ending here</param>
        public void AddResult(string result)
        {
            if (_results.Contains(result)) return;
            _results.Add(result);
            // Keep the array snapshot in sync so searches read a plain array
            _resultsAr = (string[])_results.ToArray(typeof(string));
        }

        /// <summary>
        /// Adds a child node, keyed by the child's character
        /// </summary>
        /// <param name="node">Child node to add</param>
        public void AddTransition(TreeNode node)
        {
            _transHash.Add(node.Char, node);
            // Refresh the array snapshot returned by Transitions
            TreeNode[] ar = new TreeNode[_transHash.Count];
            _transHash.Values.CopyTo(ar, 0);
            _transitionsAr = ar;
        }

        /// <summary>
        /// Returns the child reached via the given character, if any
        /// </summary>
        /// <param name="c">Character to follow</param>
        /// <returns>The child node, or null when there is no such edge</returns>
        public TreeNode GetTransition(char c)
        {
            return (TreeNode)_transHash[c];
        }

        /// <summary>
        /// Tells whether this node has a child for the given character
        /// </summary>
        /// <param name="c">Character to follow</param>
        /// <returns>True if such a child exists</returns>
        public bool ContainsTransition(char c)
        {
            return GetTransition(c) != null;
        }

        #endregion

        #region Properties

        private readonly char _char;
        private readonly TreeNode _parent;
        private TreeNode _failure;
        private readonly ArrayList _results;
        private TreeNode[] _transitionsAr;
        private string[] _resultsAr;
        private readonly Hashtable _transHash;

        /// <summary>
        /// Character on the edge leading to this node
        /// </summary>
        public char Char
        {
            get { return _char; }
        }

        /// <summary>
        /// Parent node (null for the root)
        /// </summary>
        public TreeNode Parent
        {
            get { return _parent; }
        }

        /// <summary>
        /// Failure link: the state to continue from when no child matches
        /// the next input character
        /// </summary>
        public TreeNode Failure
        {
            get { return _failure; }
            set { _failure = value; }
        }

        /// <summary>
        /// All child nodes of this node
        /// </summary>
        public TreeNode[] Transitions
        {
            get { return _transitionsAr; }
        }

        /// <summary>
        /// Keywords that end when this node is reached
        /// </summary>
        public string[] Results
        {
            get { return _resultsAr; }
        }

        #endregion
    }

    #endregion

    #region Local fields

    /// <summary>
    /// Root of the keyword tree; every constructor builds it, so it is never null
    /// </summary>
    private TreeNode _root;

    /// <summary>
    /// Keywords to search for, exactly as supplied by the caller (may be null)
    /// </summary>
    private string[] _keywords;

    #endregion

    #region Initialization

    /// <summary>
    /// Creates a searcher for the given keywords and builds the keyword tree
    /// </summary>
    /// <param name="keywords">Keywords to search for (null means "no keywords")</param>
    public StringSearch(string[] keywords)
    {
        Keywords = keywords;
    }

    /// <summary>
    /// Creates a searcher without keywords; assign the Keywords property
    /// later. Until then every search reports no match.
    /// </summary>
    public StringSearch()
    {
        // Build an empty tree so searching before Keywords is assigned is safe
        BuildTree();
    }

    #endregion

    #region Implementation

    /// <summary>
    /// Rebuilds the keyword tree and its failure links from the current keywords
    /// </summary>
    void BuildTree()
    {
        // Phase 1: insert every keyword into the trie
        _root = new TreeNode(null, ' ');
        if (_keywords != null)
        {
            foreach (string p in _keywords)
            {
                // A null entry cannot be matched, and an empty one would be
                // attached to the root and reported as zero-length hits at
                // meaningless positions - skip both.
                if (p == null || p.Length == 0) continue;

                TreeNode nd = _root;
                foreach (char c in p)
                {
                    TreeNode ndNew = nd.GetTransition(c);
                    if (ndNew == null)
                    {
                        ndNew = new TreeNode(nd, c);
                        nd.AddTransition(ndNew);
                    }
                    nd = ndNew;
                }
                nd.AddResult(p);
            }
        }

        // Phase 2: compute failure links breadth-first, so the failure target
        // (always a shallower node) is complete before it is used.
        Queue pending = new Queue();

        // Depth-1 nodes always fail back to the root
        foreach (TreeNode top in _root.Transitions)
        {
            top.Failure = _root;
            foreach (TreeNode child in top.Transitions) pending.Enqueue(child);
        }

        while (pending.Count != 0)
        {
            TreeNode nd = (TreeNode)pending.Dequeue();
            char c = nd.Char;

            // Walk the parent's failure chain until a state with an edge for c
            // is found; the root's failure link is still null here and ends the walk.
            TreeNode r = nd.Parent.Failure;
            while (r != null && !r.ContainsTransition(c)) r = r.Failure;

            if (r == null)
            {
                nd.Failure = _root;
            }
            else
            {
                nd.Failure = r.GetTransition(c);
                // Keywords ending at the failure target also end here
                foreach (string result in nd.Failure.Results)
                    nd.AddResult(result);
            }

            foreach (TreeNode child in nd.Transitions)
                pending.Enqueue(child);
        }

        _root.Failure = _root;
    }

    /// <summary>
    /// Advances the automaton by one input character
    /// </summary>
    /// <param name="root">Root of the keyword tree</param>
    /// <param name="state">Current state</param>
    /// <param name="c">Next input character</param>
    /// <returns>State after consuming the character (the root when nothing matches)</returns>
    private static TreeNode NextState(TreeNode root, TreeNode state, char c)
    {
        TreeNode next = state.GetTransition(c);
        while (next == null && state != root)
        {
            state = state.Failure;
            next = state.GetTransition(c);
        }
        // When next is null the loop has ended at the root
        return next != null ? next : state;
    }

    #endregion

    #region Methods & Properties

    /// <summary>
    /// Keywords to search for. Setting this property is slow because the
    /// keyword tree is rebuilt. Null or empty entries are ignored, and a null
    /// array means "no keywords". The array is stored by reference and only
    /// read while the tree is built.
    /// </summary>
    public string[] Keywords
    {
        get { return _keywords; }
        set
        {
            _keywords = value;
            BuildTree();
        }
    }

    /// <summary>
    /// Searches the text and returns all occurrences of any keyword,
    /// including overlapping ones
    /// </summary>
    /// <param name="text">Text to search</param>
    /// <returns>Array of occurrences (empty when nothing is found)</returns>
    /// <exception cref="ArgumentNullException">text is null</exception>
    public StringSearchResult[] FindAll(string text)
    {
        if (text == null) throw new ArgumentNullException("text");

        ArrayList ret = new ArrayList();
        TreeNode root = _root;
        TreeNode ptr = root;
        for (int index = 0; index < text.Length; index++)
        {
            ptr = NextState(root, ptr, text[index]);
            // index is the last character of the hit, so step back to its start
            foreach (string found in ptr.Results)
                ret.Add(new StringSearchResult(index - found.Length + 1, found));
        }
        return (StringSearchResult[])ret.ToArray(typeof(StringSearchResult));
    }

    /// <summary>
    /// Searches the text and returns the first occurrence detected while
    /// scanning from left to right, i.e. the occurrence that ends earliest.
    /// If several keywords end at that position, the one listed first for
    /// the reached state is returned.
    /// </summary>
    /// <param name="text">Text to search</param>
    /// <returns>First occurrence of any keyword (or StringSearchResult.Empty if text doesn't contain any keyword)</returns>
    /// <exception cref="ArgumentNullException">text is null</exception>
    public StringSearchResult FindFirst(string text)
    {
        if (text == null) throw new ArgumentNullException("text");

        TreeNode root = _root;
        TreeNode ptr = root;
        for (int index = 0; index < text.Length; index++)
        {
            ptr = NextState(root, ptr, text[index]);
            string[] hits = ptr.Results;
            if (hits.Length > 0)
                return new StringSearchResult(index - hits[0].Length + 1, hits[0]);
        }
        return StringSearchResult.Empty;
    }

    /// <summary>
    /// Searches the text and returns true if it contains any keyword
    /// </summary>
    /// <param name="text">Text to search</param>
    /// <returns>True when text contains any keyword</returns>
    /// <exception cref="ArgumentNullException">text is null</exception>
    public bool ContainsAny(string text)
    {
        if (text == null) throw new ArgumentNullException("text");

        TreeNode root = _root;
        TreeNode ptr = root;
        for (int index = 0; index < text.Length; index++)
        {
            ptr = NextState(root, ptr, text[index]);
            if (ptr.Results.Length > 0) return true;
        }
        return false;
    }

    #endregion
}
}
/// <summary>
/// Two-stage table holding one byte per UTF-16 code unit (index 0..0xFFFF).
/// All 256 blocks initially share a single default block, and a block gets
/// its own storage only when a value in it is first written. This uses less
/// memory than a flat byte array with one slot per character.
/// </summary>
class UnicodeSkipArray
{
    // Number of entries per block (and number of blocks)
    private const int BlockSize = 0x100;

    // Value reported for every entry that was never written:
    // the pattern length, saturated to the byte range
    private byte _patternLength;

    // Shared block filled with the default value
    private byte[] _default;

    // Master table: one block reference per group of BlockSize indices
    private byte[][] _skipTable;

    /// <summary>
    /// Initializes the table so that every entry holds the pattern length.
    /// </summary>
    /// <param name="patternLength">
    /// Length of the search pattern. Values above 255 are stored as 255
    /// (and negative values as 0) instead of wrapping around in the byte
    /// cast, where for example 256 would have become 0.
    /// </param>
    public UnicodeSkipArray(int patternLength)
    {
        _patternLength = ClampToByte(patternLength);

        // Default block (filled with the default value)
        _default = new byte[BlockSize];
        InitializeBlock(_default);

        // Master table: every slot starts out pointing at the shared block
        _skipTable = new byte[BlockSize][];
        for (int i = 0; i < BlockSize; i++)
            _skipTable[i] = _default;
    }

    /// <summary>
    /// Gets or sets the value stored for the given index.
    /// </summary>
    /// <param name="index">Entry index, 0 to 0xFFFF (a char converts implicitly)</param>
    /// <returns>The stored value, or the default for entries never written</returns>
    public byte this[int index]
    {
        get
        {
            return _skipTable[index / BlockSize][index % BlockSize];
        }
        set
        {
            int block = index / BlockSize;

            // Never write into the shared default block: give this slot its
            // own block first, so other slots keep reporting the default.
            if (_skipTable[block] == _default)
            {
                _skipTable[block] = new byte[BlockSize];
                InitializeBlock(_skipTable[block]);
            }

            _skipTable[block][index % BlockSize] = value;
        }
    }

    /// <summary>
    /// Fills a block with the default ("no match") value.
    /// </summary>
    /// <param name="block">Block to be initialized</param>
    private void InitializeBlock(byte[] block)
    {
        for (int i = 0; i < BlockSize; i++)
            block[i] = _patternLength;
    }

    /// <summary>
    /// Converts an int to a byte by saturating at the ends of the byte range.
    /// </summary>
    /// <param name="value">Value to convert</param>
    /// <returns>0 for negative input, 255 for input above 255, otherwise the value</returns>
    private static byte ClampToByte(int value)
    {
        if (value < 0) return 0;
        if (value > Byte.MaxValue) return Byte.MaxValue;
        return (byte)value;
    }
}
/// <summary>
/// Single-pattern text search that compares the pattern right to left and
/// uses a per-character skip table (Boyer-Moore bad-character rule) to jump
/// ahead after a mismatch.
/// </summary>
class BoyerMoore
{
    // Returned index when no match found
    public const int InvalidIndex = -1;

    private string _pattern;
    private bool _ignoreCase;
    private UnicodeSkipArray _skipArray;

    /// <summary>
    /// Creates a case-sensitive searcher for the given pattern.
    /// </summary>
    /// <param name="pattern">Pattern to search for</param>
    /// <exception cref="ArgumentNullException">pattern is null</exception>
    public BoyerMoore(string pattern)
    {
        Initialize(pattern, false);
    }

    /// <summary>
    /// Creates a searcher for the given pattern.
    /// </summary>
    /// <param name="pattern">Pattern to search for</param>
    /// <param name="ignoreCase">If true, search is case-insensitive</param>
    /// <exception cref="ArgumentNullException">pattern is null</exception>
    public BoyerMoore(string pattern, bool ignoreCase)
    {
        Initialize(pattern, ignoreCase);
    }

    /// <summary>
    /// Re-initializes this instance to search a new pattern, case-sensitively.
    /// </summary>
    /// <param name="pattern">Pattern to search for</param>
    /// <exception cref="ArgumentNullException">pattern is null</exception>
    public void Initialize(string pattern)
    {
        Initialize(pattern, false);
    }

    /// <summary>
    /// Re-initializes this instance to search a new pattern.
    /// </summary>
    /// <param name="pattern">Pattern to search for</param>
    /// <param name="ignoreCase">If true, search is case-insensitive</param>
    /// <exception cref="ArgumentNullException">pattern is null</exception>
    public void Initialize(string pattern, bool ignoreCase)
    {
        if (pattern == null) throw new ArgumentNullException("pattern");

        // Build into a local so a failure leaves the previous state intact
        UnicodeSkipArray skips = new UnicodeSkipArray(pattern.Length);

        // For every pattern character except the last, record its distance
        // to the end of the pattern; later occurrences overwrite earlier
        // ones, so the smallest distance wins.
        int last = pattern.Length - 1;
        for (int i = 0; i < last; i++)
        {
            // In case-insensitive mode the table is keyed by the same folded
            // form that Search compares with. Keying by ToLower/ToUpper of the
            // pattern character misses text characters that merely share its
            // upper-case form, and such a character could skip past a match.
            char key = ignoreCase ? Fold(pattern[i]) : pattern[i];
            skips[key] = ClampToByte(last - i);
        }

        _pattern = pattern;
        _ignoreCase = ignoreCase;
        _skipArray = skips;
    }

    /// <summary>
    /// Searches for the current pattern within the given text
    /// starting at the beginning.
    /// </summary>
    /// <param name="text">Text to search</param>
    /// <returns>Index of the first match, or InvalidIndex when there is none</returns>
    /// <exception cref="ArgumentNullException">text is null</exception>
    public int Search(string text)
    {
        return Search(text, 0);
    }

    /// <summary>
    /// Searches for the current pattern within the given text
    /// starting at the specified index.
    /// </summary>
    /// <param name="text">Text to search</param>
    /// <param name="startIndex">Offset to begin search</param>
    /// <returns>
    /// Index of the first match at or after startIndex, or InvalidIndex when
    /// there is none. An empty pattern matches at startIndex as long as
    /// startIndex does not exceed the text length.
    /// </returns>
    /// <exception cref="ArgumentNullException">text is null</exception>
    /// <exception cref="ArgumentOutOfRangeException">startIndex is negative</exception>
    public int Search(string text, int startIndex)
    {
        if (text == null) throw new ArgumentNullException("text");
        if (startIndex < 0) throw new ArgumentOutOfRangeException("startIndex");

        int patternLength = _pattern.Length;
        int lastStart = text.Length - patternLength;
        int i = startIndex;

        // Loop while there's still room for the pattern
        while (i <= lastStart)
        {
            // Compare right to left; j ends at -1 on a full match
            int j = patternLength - 1;
            if (_ignoreCase)
            {
                while (j >= 0 && Fold(_pattern[j]) == Fold(text[i + j]))
                    j--;
            }
            else
            {
                while (j >= 0 && _pattern[j] == text[i + j])
                    j--;
            }

            if (j < 0)
            {
                // Match found
                return i;
            }

            // Shift so the mismatched text character lines up with its last
            // occurrence in the pattern, but always move at least one position.
            char bad = _ignoreCase ? Fold(text[i + j]) : text[i + j];
            i += Math.Max(_skipArray[bad] - patternLength + 1 + j, 1);
        }

        // No match found
        return InvalidIndex;
    }

    /// <summary>
    /// Case folding used for both the skip table and the comparison in
    /// case-insensitive mode, so the two can never disagree.
    /// </summary>
    /// <param name="c">Character to fold</param>
    /// <returns>Upper-case form of the character</returns>
    private static char Fold(char c)
    {
        return Char.ToUpper(c);
    }

    /// <summary>
    /// Stores a skip distance in a byte, saturating at 255. A smaller skip is
    /// always safe (only slower), whereas letting the cast wrap could turn a
    /// distance such as 256 into 0.
    /// </summary>
    /// <param name="distance">Non-negative skip distance</param>
    /// <returns>The distance, capped at 255</returns>
    private static byte ClampToByte(int distance)
    {
        return distance > Byte.MaxValue ? Byte.MaxValue : (byte)distance;
    }
}