• 博客园logo
  • 会员
  • 众包
  • 新闻
  • 博问
  • 闪存
  • 赞助商
  • HarmonyOS
  • Chat2DB
    • 搜索
      所有博客
    • 搜索
      当前博客
  • 写随笔 我的博客 短消息 简洁模式
    用户头像
    我的博客 我的园子 账号设置 会员中心 简洁模式 ... 退出登录
    注册 登录
皇图霸业谈笑间
更高、更快、更强
博客园    首页    新随笔    联系   管理    订阅  订阅
bm-search 算法

 

 
/* Aho-Corasick text search algorithm implementation 
 *  
 * For more information visit 
 *		- http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf 
 */ 
using System; 
using System.Collections; 
 
namespace EeekSoft.Text 
{ 
	/// <summary>
	/// Contract implemented by keyword search algorithms: a configurable
	/// list of keywords and three ways of querying a text against it.
	/// </summary>
	public interface IStringSearchAlgorithm
	{
		/// <summary>
		/// Gets or sets the keywords the algorithm looks for.
		/// </summary>
		string[] Keywords { get; set; }

		/// <summary>
		/// Scans the text and reports every keyword occurrence found.
		/// </summary>
		/// <param name="text">Text to scan</param>
		/// <returns>One entry per occurrence</returns>
		StringSearchResult[] FindAll(string text);

		/// <summary>
		/// Scans the text and reports the first keyword occurrence found.
		/// </summary>
		/// <param name="text">Text to scan</param>
		/// <returns>
		/// The first occurrence, or StringSearchResult.Empty when the
		/// text contains none of the keywords
		/// </returns>
		StringSearchResult FindFirst(string text);

		/// <summary>
		/// Tells whether the text contains at least one of the keywords.
		/// </summary>
		/// <param name="text">Text to scan</param>
		/// <returns>True if any keyword occurs in the text</returns>
		bool ContainsAny(string text);
	}
 
	/// <summary>
	/// Value describing a single search hit: the keyword that matched
	/// and the position in the searched text where it starts.
	/// </summary>
	public struct StringSearchResult
	{
		// Both fields are assigned once, in the constructor.
		private readonly int _index;
		private readonly string _keyword;

		/// <summary>
		/// Creates a result for the given position and keyword.
		/// </summary>
		/// <param name="index">Position of the keyword in the text</param>
		/// <param name="keyword">Keyword that was found</param>
		public StringSearchResult(int index, string keyword)
		{
			this._index = index;
			this._keyword = keyword;
		}

		/// <summary>
		/// Result used to signal "nothing found": index -1 and an
		/// empty keyword.
		/// </summary>
		public static StringSearchResult Empty
		{
			get
			{
				StringSearchResult nothing = new StringSearchResult(-1, "");
				return nothing;
			}
		}

		/// <summary>
		/// Keyword that was found.
		/// </summary>
		public string Keyword
		{
			get { return this._keyword; }
		}

		/// <summary>
		/// Position of the found keyword in the original text.
		/// </summary>
		public int Index
		{
			get { return this._index; }
		}
	}
 
 
	/// <summary>
	/// Searches a text for any of a set of keywords in a single pass,
	/// using a keyword tree with failure links (Aho-Corasick algorithm).
	/// </summary>
	/// <remarks>
	/// The tree is rebuilt every time <see cref="Keywords"/> is assigned.
	/// A null keyword list, and null or empty entries inside the list,
	/// are ignored; an instance without usable keywords never reports a
	/// match.
	/// </remarks>
	public class StringSearch : IStringSearchAlgorithm
	{
		#region Objects

		/// <summary>
		/// State of the keyword tree: the character leading into it, its
		/// outgoing transitions, its failure link and the keywords that
		/// are recognised when the state is reached.
		/// </summary>
		class TreeNode
		{
			#region Fields

			private char _char;
			private TreeNode _parent;
			private TreeNode _failure;
			private ArrayList _results;
			private string[] _resultsAr;
			private TreeNode[] _transitionsAr;
			private Hashtable _transHash;

			#endregion
			#region Constructor & Methods

			/// <summary>
			/// Creates a node reached from <paramref name="parent"/> by
			/// reading <paramref name="c"/>.
			/// </summary>
			/// <param name="parent">Parent node (null for the root)</param>
			/// <param name="c">Character leading into this node</param>
			public TreeNode(TreeNode parent, char c)
			{
				_char = c;
				_parent = parent;
				_results = new ArrayList();
				_resultsAr = new string[0];
				_transitionsAr = new TreeNode[0];
				_transHash = new Hashtable();
			}

			/// <summary>
			/// Registers a keyword that ends in this node. A keyword that
			/// is already registered is not added a second time.
			/// </summary>
			/// <param name="result">Keyword</param>
			public void AddResult(string result)
			{
				if (_results.Contains(result)) return;
				_results.Add(result);
				// Refresh the typed array exposed through Results.
				_resultsAr = (string[])_results.ToArray(typeof(string));
			}

			/// <summary>
			/// Adds a transition to a child node, keyed by the child's
			/// character.
			/// </summary>
			/// <param name="node">Child node</param>
			public void AddTransition(TreeNode node)
			{
				_transHash.Add(node.Char, node);
				// Refresh the typed array exposed through Transitions.
				TreeNode[] children = new TreeNode[_transHash.Values.Count];
				_transHash.Values.CopyTo(children, 0);
				_transitionsAr = children;
			}

			/// <summary>
			/// Returns the child reached by the given character, if any.
			/// </summary>
			/// <param name="c">Character</param>
			/// <returns>The child node, or null when there is none</returns>
			public TreeNode GetTransition(char c)
			{
				return (TreeNode)_transHash[c];
			}

			/// <summary>
			/// Tells whether a transition exists for the given character.
			/// </summary>
			/// <param name="c">Character</param>
			/// <returns>True if a transition exists</returns>
			public bool ContainsTransition(char c)
			{
				return GetTransition(c) != null;
			}

			#endregion
			#region Properties

			/// <summary>
			/// Character leading into this node.
			/// </summary>
			public char Char
			{
				get { return _char; }
			}

			/// <summary>
			/// Parent node (null for the root).
			/// </summary>
			public TreeNode Parent
			{
				get { return _parent; }
			}

			/// <summary>
			/// Failure link: the node the search falls back to when this
			/// node has no transition for the current character.
			/// </summary>
			public TreeNode Failure
			{
				get { return _failure; }
				set { _failure = value; }
			}

			/// <summary>
			/// Child nodes of this node.
			/// </summary>
			public TreeNode[] Transitions
			{
				get { return _transitionsAr; }
			}

			/// <summary>
			/// Keywords recognised when this node is reached: the keyword
			/// ending exactly here plus those copied from the failure
			/// link while the tree was built.
			/// </summary>
			public string[] Results
			{
				get { return _resultsAr; }
			}

			#endregion
		}

		#endregion
		#region Local fields

		/// <summary>
		/// Root of the keyword tree
		/// </summary>
		private TreeNode _root;

		/// <summary>
		/// Keywords to search for, exactly as supplied by the caller
		/// </summary>
		private string[] _keywords;

		#endregion
		#region Initialization

		/// <summary>
		/// Creates a searcher for the given keywords (builds the keyword
		/// tree).
		/// </summary>
		/// <param name="keywords">Keywords to search for</param>
		public StringSearch(string[] keywords)
		{
			Keywords = keywords;
		}

		/// <summary>
		/// Creates a searcher without keywords; assign
		/// <see cref="Keywords"/> later. Until then every search simply
		/// finds nothing.
		/// </summary>
		public StringSearch()
		{
			// Build an empty tree so that searching before Keywords is
			// assigned does not dereference a null root.
			BuildTree();
		}

		#endregion
		#region Implementation

		/// <summary>
		/// Builds the keyword tree and its failure links from the current
		/// keyword list.
		/// </summary>
		void BuildTree()
		{
			_root = new TreeNode(null, ' ');

			// Phase 1: insert every keyword, one node per character.
			if (_keywords != null)
			{
				foreach (string p in _keywords)
				{
					// A null entry cannot be enumerated and an empty one
					// would be registered on the root and reported with
					// meaningless positions, so both are skipped.
					if (p == null || p.Length == 0) continue;

					TreeNode nd = _root;
					foreach (char c in p)
					{
						TreeNode next = nd.GetTransition(c);
						if (next == null)
						{
							next = new TreeNode(nd, c);
							nd.AddTransition(next);
						}
						nd = next;
					}
					nd.AddResult(p);
				}
			}

			// Phase 2: compute failure links breadth-first, so that the
			// link of a node is known before its children are handled.
			Queue pending = new Queue();
			foreach (TreeNode nd in _root.Transitions)
			{
				// Children of the root always fall back to the root.
				nd.Failure = _root;
				pending.Enqueue(nd);
			}
			while (pending.Count != 0)
			{
				TreeNode nd = (TreeNode)pending.Dequeue();
				foreach (TreeNode child in nd.Transitions)
				{
					char c = child.Char;

					// Walk the parent's failure chain until a node with a
					// transition for c is found. The root's own link is
					// still null here, which ends the walk.
					TreeNode r = nd.Failure;
					while (r != null && !r.ContainsTransition(c)) r = r.Failure;

					if (r == null)
					{
						child.Failure = _root;
					}
					else
					{
						child.Failure = r.GetTransition(c);
						// Keywords recognised at the fallback node also
						// end at this node.
						foreach (string result in child.Failure.Results)
							child.AddResult(result);
					}
					pending.Enqueue(child);
				}
			}

			// From now on the root falls back to itself.
			_root.Failure = _root;
		}

		/// <summary>
		/// Advances the search by one character: follows failure links
		/// until a node with a transition for <paramref name="c"/> is
		/// found, or stays at the root when there is none.
		/// </summary>
		/// <param name="state">Current node</param>
		/// <param name="c">Character read from the text</param>
		/// <returns>Node the search is in after reading the character</returns>
		private TreeNode NextState(TreeNode state, char c)
		{
			TreeNode trans = state.GetTransition(c);
			while (trans == null && state != _root)
			{
				state = state.Failure;
				trans = state.GetTransition(c);
			}
			return trans != null ? trans : state;
		}

		#endregion
		#region Methods & Properties

		/// <summary>
		/// Keywords to search for. Setting this property is slow because
		/// it requires rebuilding the keyword tree. Assigning null
		/// removes all keywords.
		/// </summary>
		public string[] Keywords
		{
			get { return _keywords; }
			set
			{
				_keywords = value;
				BuildTree();
			}
		}

		/// <summary>
		/// Searches the text and returns all occurrences of any keyword,
		/// ordered by the position at which each occurrence ends.
		/// </summary>
		/// <param name="text">Text to search</param>
		/// <returns>Array of occurrences (empty when nothing is found)</returns>
		/// <exception cref="ArgumentNullException">text is null</exception>
		public StringSearchResult[] FindAll(string text)
		{
			if (text == null) throw new ArgumentNullException("text");

			ArrayList ret = new ArrayList();
			TreeNode ptr = _root;
			for (int index = 0; index < text.Length; index++)
			{
				ptr = NextState(ptr, text[index]);
				// index is the last character of each keyword ending here.
				foreach (string found in ptr.Results)
					ret.Add(new StringSearchResult(index - found.Length + 1, found));
			}
			return (StringSearchResult[])ret.ToArray(typeof(StringSearchResult));
		}

		/// <summary>
		/// Searches the text and returns the first occurrence found.
		/// </summary>
		/// <remarks>
		/// "First" means the occurrence whose end is reached first while
		/// scanning left to right. When several keywords end at that
		/// position, the first entry of the node's result list is
		/// returned. A longer keyword that starts earlier but ends later
		/// is therefore not reported.
		/// </remarks>
		/// <param name="text">Text to search</param>
		/// <returns>
		/// First occurrence of any keyword, or StringSearchResult.Empty
		/// if the text doesn't contain any keyword
		/// </returns>
		/// <exception cref="ArgumentNullException">text is null</exception>
		public StringSearchResult FindFirst(string text)
		{
			if (text == null) throw new ArgumentNullException("text");

			TreeNode ptr = _root;
			for (int index = 0; index < text.Length; index++)
			{
				ptr = NextState(ptr, text[index]);
				if (ptr.Results.Length > 0)
				{
					string found = ptr.Results[0];
					return new StringSearchResult(index - found.Length + 1, found);
				}
			}
			return StringSearchResult.Empty;
		}

		/// <summary>
		/// Searches the text and returns true if it contains any keyword.
		/// </summary>
		/// <param name="text">Text to search</param>
		/// <returns>True when the text contains any keyword</returns>
		/// <exception cref="ArgumentNullException">text is null</exception>
		public bool ContainsAny(string text)
		{
			if (text == null) throw new ArgumentNullException("text");

			TreeNode ptr = _root;
			for (int index = 0; index < text.Length; index++)
			{
				ptr = NextState(ptr, text[index]);
				if (ptr.Results.Length > 0) return true;
			}
			return false;
		}

		#endregion
	}
} 

 

 

/// <summary>
/// Two-level table of byte values addressed by an index in the range
/// 0..65535 (256 blocks of 256 entries). Every block initially refers to
/// one shared default block, so the table uses far less memory than a
/// flat array with one entry per index.
/// </summary>
class UnicodeSkipArray
{
    // Size of each block, and also the number of blocks
    private const int BlockSize = 0x100;

    // Value stored in every entry that was never written
    private byte _patternLength;
    // Shared block filled with the default value
    private byte[] _default;
    // Master table: one block reference per group of BlockSize indexes
    private byte[][] _skipTable;

    /// <summary>
    /// Initializes this UnicodeSkipArray instance.
    /// </summary>
    /// <param name="patternLength">
    /// Length of the search pattern, used as the default entry value.
    /// Lengths above 255 are stored as 255 because entries are bytes.
    /// </param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// patternLength is negative
    /// </exception>
    public UnicodeSkipArray(int patternLength)
    {
        if (patternLength < 0)
            throw new ArgumentOutOfRangeException("patternLength");

        // Clamp instead of casting directly: a plain (byte) cast wraps
        // modulo 256 (256 becomes 0, 300 becomes 44) and throws in a
        // checked build.
        _patternLength = (byte)Math.Min(patternLength, byte.MaxValue);

        // Default block (filled with default value)
        _default = new byte[BlockSize];
        InitializeBlock(_default);

        // Master table: every slot starts out sharing the default block
        _skipTable = new byte[BlockSize][];
        for (int i = 0; i < BlockSize; i++)
            _skipTable[i] = _default;
    }

    /// <summary>
    /// Gets or sets the value stored for the given index.
    /// </summary>
    /// <param name="index">Index in the range 0..65535</param>
    /// <returns>
    /// The stored value, or the default value if none was written
    /// </returns>
    public byte this[int index]
    {
        get
        {
            return _skipTable[index / BlockSize][index % BlockSize];
        }
        set
        {
            int block = index / BlockSize;
            // The shared default block must never be modified; give this
            // slot its own block on the first write.
            if (_skipTable[block] == _default)
            {
                _skipTable[block] = new byte[BlockSize];
                InitializeBlock(_skipTable[block]);
            }
            _skipTable[block][index % BlockSize] = value;
        }
    }

    /// <summary>
    /// Fills a block with the default value.
    /// </summary>
    /// <param name="block">Block to be initialized</param>
    private void InitializeBlock(byte[] block)
    {
        for (int i = 0; i < BlockSize; i++)
            block[i] = _patternLength;
    }
}

/// <summary>
/// Searches a text for a single pattern, comparing right to left and
/// using a per-character skip table to jump over positions that cannot
/// match (bad-character rule of the Boyer-Moore algorithm).
/// </summary>
class BoyerMoore
{
    private string _pattern;
    private bool _ignoreCase;
    private UnicodeSkipArray _skipArray;

    // Returned index when no match found
    public const int InvalidIndex = -1;

    /// <summary>
    /// Creates a case-sensitive searcher for the given pattern.
    /// </summary>
    /// <param name="pattern">Pattern to search for</param>
    /// <exception cref="ArgumentNullException">pattern is null</exception>
    public BoyerMoore(string pattern)
    {
        Initialize(pattern, false);
    }

    /// <summary>
    /// Creates a searcher for the given pattern.
    /// </summary>
    /// <param name="pattern">Pattern to search for</param>
    /// <param name="ignoreCase">If true, search is case-insensitive</param>
    /// <exception cref="ArgumentNullException">pattern is null</exception>
    public BoyerMoore(string pattern, bool ignoreCase)
    {
        Initialize(pattern, ignoreCase);
    }

    /// <summary>
    /// Initializes this instance to search a new pattern
    /// (case-sensitive).
    /// </summary>
    /// <param name="pattern">Pattern to search for</param>
    /// <exception cref="ArgumentNullException">pattern is null</exception>
    public void Initialize(string pattern)
    {
        Initialize(pattern, false);
    }

    /// <summary>
    /// Initializes this instance to search a new pattern.
    /// </summary>
    /// <param name="pattern">Pattern to search for</param>
    /// <param name="ignoreCase">If true, search is case-insensitive</param>
    /// <exception cref="ArgumentNullException">pattern is null</exception>
    public void Initialize(string pattern, bool ignoreCase)
    {
        // Validate before touching any field so a failed call leaves the
        // previously configured pattern intact.
        if (pattern == null)
            throw new ArgumentNullException("pattern");

        _pattern = pattern;
        _ignoreCase = ignoreCase;

        // Create multi-stage skip table; unwritten entries default to
        // the pattern length.
        _skipArray = new UnicodeSkipArray(_pattern.Length);

        // For every pattern character except the last, record its
        // distance to the end of the pattern. Later (rightmost)
        // occurrences overwrite earlier ones.
        int last = _pattern.Length - 1;
        for (int i = 0; i < last; i++)
        {
            // Entries are bytes: clamp rather than let the cast wrap
            // around for patterns longer than 255 characters. A smaller
            // skip only costs extra comparisons, never a missed match.
            byte skip = (byte)Math.Min(last - i, byte.MaxValue);

            // Key the table with the same folding Search uses for
            // comparison, so that any text character that compares equal
            // to a pattern character also finds that character's skip.
            _skipArray[Fold(_pattern[i])] = skip;
        }
    }

    /// <summary>
    /// Searches for the current pattern within the given text
    /// starting at the beginning.
    /// </summary>
    /// <param name="text">Text to search</param>
    /// <returns>
    /// Index of the first match, or InvalidIndex if there is none
    /// </returns>
    /// <exception cref="ArgumentNullException">text is null</exception>
    public int Search(string text)
    {
        return Search(text, 0);
    }

    /// <summary>
    /// Searches for the current pattern within the given text
    /// starting at the specified index.
    /// </summary>
    /// <param name="text">Text to search</param>
    /// <param name="startIndex">Offset to begin search</param>
    /// <returns>
    /// Index of the first match at or after startIndex, or InvalidIndex
    /// if there is none. An empty pattern matches at startIndex as long
    /// as startIndex does not exceed the text length.
    /// </returns>
    /// <exception cref="ArgumentNullException">text is null</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// startIndex is negative
    /// </exception>
    public int Search(string text, int startIndex)
    {
        if (text == null)
            throw new ArgumentNullException("text");
        if (startIndex < 0)
            throw new ArgumentOutOfRangeException("startIndex");

        int patternLength = _pattern.Length;
        int lastStart = text.Length - patternLength;
        int i = startIndex;

        // Loop while there's still room for the pattern
        while (i <= lastStart)
        {
            // Compare right to left at this position
            int j = patternLength - 1;
            while (j >= 0 && Fold(_pattern[j]) == Fold(text[i + j]))
                j--;

            if (j < 0)
            {
                // Match found
                return i;
            }

            // Advance to the next comparison: line up the rightmost
            // occurrence of the mismatching text character with its
            // position in the text, but always move at least one step.
            i += Math.Max(_skipArray[Fold(text[i + j])] - patternLength + 1 + j, 1);
        }
        // No match found
        return InvalidIndex;
    }

    /// <summary>
    /// Maps a character to the form used for comparison and for skip
    /// table lookups: upper case (Char.ToUpper) when the search ignores
    /// case, the character itself otherwise.
    /// </summary>
    /// <param name="c">Character from the pattern or the text</param>
    /// <returns>Character to compare and index with</returns>
    private char Fold(char c)
    {
        return _ignoreCase ? Char.ToUpper(c) : c;
    }
}

 

posted on 2013-12-11 12:48  布颜书  阅读(480)  评论(0)    收藏  举报
刷新页面返回顶部
博客园  ©  2004-2025
浙公网安备 33010602011771号 浙ICP备2021040463号-3