DFA算法C#实现

/// <summary>
    /// Forbidden-word filter. The word list is compiled into a character trie
    /// (the "DFA" of the original description): every node maps the next
    /// character to a child node and records whether a word ends there.
    /// Scanning an input string then only needs to walk the trie from each
    /// start position.
    /// </summary>
    public class ForbiddentWordLibrary
    {
        /// <summary>
        /// Builds the library from a text file that holds one forbidden word
        /// per line. The file is read as UTF-8; each line is trimmed and
        /// blank lines are ignored.
        /// </summary>
        /// <param name="path">Path of the word-list file.</param>
        public ForbiddentWordLibrary( string path )
        {
            // No try/catch here: I/O exceptions propagate to the caller with
            // their original stack trace intact.
            words = new HashSet<string>();
            using( var stream = new StreamReader( path, Encoding.UTF8 ) )
            {
                string line;
                while( ( line = stream.ReadLine() ) != null )
                {
                    string word = line.Trim();
                    if( word.Length > 0 )
                    {
                        words.Add( word );
                    }
                }
            }
            InitLibrary();
        }

        /// <summary>
        /// Finds every forbidden word occurring in <paramref name="input"/>,
        /// scanning left to right. At each position the longest forbidden word
        /// starting there is taken, and scanning resumes after it, so reported
        /// matches never overlap.
        /// </summary>
        /// <param name="input">Text to scan; null or empty yields an empty list.</param>
        /// <returns>The matched words in order of appearance (duplicates included).</returns>
        public List<string> GetAllForbiddenWords( string input )
        {
            List<string> result = new List<string>();
            if( string.IsNullOrEmpty( input ) )
            {
                return result;
            }

            for( int i = 0; i < input.Length; i++ )
            {
                int length = SearchFW( input, i );
                if( length > 0 )
                {
                    result.Add( input.Substring( i, length ) );
                    // Jump to the last character of the match; the loop's
                    // own i++ then moves to the first character after it.
                    i = i + length - 1;
                }
            }

            return result;
        }

        /// <summary>
        /// Walks the trie along <paramref name="input"/> starting at
        /// <paramref name="beginIndex"/> and returns the length of the longest
        /// forbidden word that starts exactly there.
        /// </summary>
        /// <param name="input">Text being scanned.</param>
        /// <param name="beginIndex">Index of the first character to match.</param>
        /// <returns>Length of the longest match, or 0 when no word starts at that index.</returns>
        private int SearchFW( string input, int beginIndex )
        {
            int matched = 0;
            TrieNode node = lib;
            for( int i = beginIndex; i < input.Length; i++ )
            {
                TrieNode next;
                if( !node.Children.TryGetValue( input[ i ], out next ) )
                {
                    break;
                }

                node = next;
                if( node.IsEnd )
                {
                    // Record the length only at word ends. Characters walked
                    // past the last complete word (a prefix of a longer word
                    // that never completes) must not be counted.
                    matched = i - beginIndex + 1;
                }
            }

            return matched;
        }

        /// <summary>
        /// Compiles <see cref="words"/> into the trie rooted at <see cref="lib"/>.
        /// </summary>
        private void InitLibrary()
        {
            lib = new TrieNode();
            foreach( string k in words )
            {
                TrieNode node = lib;
                foreach( char c in k )
                {
                    TrieNode next;
                    if( !node.Children.TryGetValue( c, out next ) )
                    {
                        next = new TrieNode();
                        node.Children.Add( c, next );
                    }
                    node = next;
                }

                // The node reached after the last character terminates a word.
                // Existing children are kept, so "ab" and "abcd" can coexist.
                node.IsEnd = true;
            }
        }

        /// <summary>
        /// One trie state: outgoing transitions keyed by character, plus a flag
        /// telling whether the path from the root to this node spells a
        /// complete forbidden word.
        /// </summary>
        private sealed class TrieNode
        {
            public readonly Dictionary<char, TrieNode> Children = new Dictionary<char, TrieNode>();
            public bool IsEnd;
        }

        /// <summary>
        /// Raw set of forbidden words as loaded from the file.
        /// </summary>
        private HashSet<string> words;
        /// <summary>
        /// Root of the forbidden-word trie.
        /// </summary>
        private TrieNode lib;
    }

 

posted @ 2022-10-24 14:14  黄明辉  阅读(86)  评论(0编辑  收藏  举报