/// <summary>
/// Forbidden-word filter. The word list is compiled into a character trie
/// that is walked like a DFA to find forbidden words inside an input string.
/// </summary>
public class ForbiddentWordLibrary
{
    /// <summary>
    /// Initializes the library from a text file that holds one forbidden word per line.
    /// The file is read as UTF-8; every line is trimmed and blank lines are skipped.
    /// </summary>
    /// <param name="path">Path of the word-list file.</param>
    public ForbiddentWordLibrary( string path )
    {
        // Deliberately no try/catch: the previous `catch { throw ex; }` added nothing
        // and reset the stack trace. Exceptions from opening or reading the file
        // propagate to the caller unchanged.
        words = new HashSet<string>();
        using( var reader = new StreamReader( path, Encoding.UTF8 ) )
        {
            string line;
            while( ( line = reader.ReadLine() ) != null )
            {
                var word = line.Trim();
                if( word.Length > 0 )
                {
                    words.Add( word );
                }
            }
        }
        InitLibrary();
    }

    /// <summary>
    /// Finds every forbidden word contained in the input string.
    /// Scanning goes left to right; at each position the longest forbidden word
    /// starting there is reported and scanning resumes right after it, so the
    /// reported matches never overlap.
    /// </summary>
    /// <param name="input">Text to scan. Null or empty yields an empty list.</param>
    /// <returns>The matched forbidden words in order of appearance (duplicates included).</returns>
    public List<string> GetAllForbiddenWords( string input )
    {
        var result = new List<string>();
        if( string.IsNullOrEmpty( input ) )
        {
            return result;
        }

        int position = 0;
        while( position < input.Length )
        {
            int length = SearchFW( input, position );
            if( length > 0 )
            {
                result.Add( input.Substring( position, length ) );
                // Jump past the match so the next search starts after it.
                position += length;
            }
            else
            {
                position++;
            }
        }
        return result;
    }

    /// <summary>
    /// Looks for the longest forbidden word that starts exactly at <paramref name="beginIndex"/>.
    /// </summary>
    /// <param name="input">Text being scanned.</param>
    /// <param name="beginIndex">Index in <paramref name="input"/> where the match must start.</param>
    /// <returns>Length of the longest forbidden word found at that index, or 0 if there is none.</returns>
    private int SearchFW( string input, int beginIndex )
    {
        int matchedLength = 0;
        TrieNode node = root;
        for( int i = beginIndex; i < input.Length; i++ )
        {
            TrieNode next;
            if( !node.Children.TryGetValue( input[ i ], out next ) )
            {
                break;
            }
            node = next;
            if( node.IsEnd )
            {
                // Record the length only at word ends. Characters walked past the
                // last complete word (a prefix of a longer word that never finishes)
                // must not be counted as part of the match.
                matchedLength = i - beginIndex + 1;
            }
        }
        return matchedLength;
    }

    /// <summary>
    /// Builds the trie from the loaded word set.
    /// </summary>
    private void InitLibrary()
    {
        foreach( string word in words )
        {
            TrieNode node = root;
            foreach( char c in word )
            {
                TrieNode child;
                if( !node.Children.TryGetValue( c, out child ) )
                {
                    child = new TrieNode();
                    node.Children.Add( c, child );
                }
                node = child;
            }
            // The node reached by the last character marks a complete word.
            node.IsEnd = true;
        }
    }

    /// <summary>
    /// One state of the trie: outgoing transitions keyed by character, plus a flag
    /// telling whether the path from the root to this node spells a complete word.
    /// </summary>
    private sealed class TrieNode
    {
        public readonly Dictionary<char, TrieNode> Children = new Dictionary<char, TrieNode>();
        public bool IsEnd;
    }

    /// <summary>
    /// Raw set of forbidden words as loaded from the file.
    /// </summary>
    private readonly HashSet<string> words;

    /// <summary>
    /// Root of the forbidden-word trie.
    /// </summary>
    private readonly TrieNode root = new TrieNode();
}