用
http://www.cnblogs.com/xingd/archive/2008/01/31/1060425.html这个算法替换敏感词汇,可以全部把我的脏字典中的敏感词汇全部命中,但这个方法怎么后面一半的脏字就不能替换呢??
奇怪了
HashSet<string> hash = new HashSet<string>();
byte[] fastCheck = new byte[char.MaxValue];
byte[] fastLength = new byte[char.MaxValue];
BitArray charCheck = new BitArray(char.MaxValue);
BitArray endCheck = new BitArray(char.MaxValue);
int maxWordLength = 0;
int minWordLength = int.MaxValue;
string wordPath = ConfigurationManager.AppSettings["badWords"];
string badWordTxtPath = System.Web.HttpContext.Current.Server.MapPath(wordPath);
string[] badWords = null;
if (System.IO.File.Exists(badWordTxtPath))
{
StreamReader sr = new StreamReader(badWordTxtPath, Encoding.Default);
badWords = sr.ReadToEnd().Split('|');
//初始化脏字典
foreach (string word in badWords)
{
maxWordLength = Math.Max(maxWordLength, word.Length);
minWordLength = Math.Min(minWordLength, word.Length);
for (int i = 0; i < 7 && i < word.Length; i++)
{
fastCheck[word[i]] |= (byte)(1 << i);
}
for (int i = 7; i < word.Length; i++)
{
fastCheck[word[i]] |= 0x80;
}
if (word.Length == 1)
{
charCheck[word[0]] = true;
}
else
{
fastLength[word[0]] |= (byte)(1 << (Math.Min(7, word.Length - 2)));
endCheck[word[word.Length - 1]] = true;
hash.Add(word);
}
}
//判断脏字是否出现在一个字符串中
int index = 0;
while (index < strContent.Length)
{
int count = 1;
if (index > 0 || (fastCheck[strContent[index]] & 1) == 0)
{
while (index < strContent.Length - 1 && (fastCheck[strContent[++index]] & 1) == 0) ;
}
char begin = strContent[index];
if (minWordLength == 1 && charCheck[begin])
{
break ;
}
for (int j = 1; j <= Math.Min(maxWordLength, strContent.Length - index - 1); j++)
{
char current = strContent[index + j];
if ((fastCheck[current] & 1) == 0)
{
++count;
}
if ((fastCheck[current] & (1 << Math.Min(j, 7))) == 0)
{
break;
}
if (j + 1 >= minWordLength)
{
if ((fastLength[begin] & (1 << Math.Min(j - 1, 7))) > 0 && endCheck[current])
{
string sub = strContent.Substring(index, j + 1);
if (hash.Contains(sub) )
{
strContent = strContent.Replace(sub, "**敏感词汇已替换**");
break;
}
}
}
}
index += count;
}
return strContent;
}
else
{
return "脏字典文件不存在!";
}