明天的明天 永远的永远 未知的一切 我与你一起承担 ??

是非成败转头空 青山依旧在 几度夕阳红 。。。
  博客园  :: 首页  :: 管理

.Net : 脏字处理类,效率很高。。。。。

Posted on 2009-12-23 16:05  且行且思  阅读(737)  评论(0)  编辑  收藏  举报

BadWordParse 类:

 

using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.IO;

namespace charCheck
{
    /// <summary>
    /// Bad-word filter backed by a dictionary file.
    /// The dictionary is a single text file (read as GB2312) whose entries are
    /// separated by a split character ('|' by default).
    /// Matching uses a per-character bitmask table (<c>fastCheck</c>) to reject
    /// most positions cheaply before doing a hash lookup.
    /// </summary>
    public class BadWordParse
    {
        // One table slot per possible char value (0..char.MaxValue inclusive).
        private const int CharTableSize = char.MaxValue + 1;

        // Dictionary words of two or more characters.
        private readonly HashSet<string> hash = new HashSet<string>();

        // Per-character position mask: bit i (0..6) is set when the character
        // occurs at position i of some word; bit 7 covers positions 7 and above.
        private readonly byte[] fastCheck = new byte[CharTableSize];

        // Characters that are a complete dictionary word on their own.
        private readonly BitArray charCheck = new BitArray(CharTableSize);

        private int maxWordLength = 0;

        private int minWordLength = int.MaxValue;

        private bool _isHave = false;

        private string _replaceString = "*";

        private char _splitString = '|';

        private string _newWord;

        private string _badWordFilePath;

        /// <summary>
        /// Whether the most recent <see cref="ReplaceBadWord"/> call found a bad word.
        /// </summary>
        public bool IsHave
        {
            get { return _isHave; }
        }

        /// <summary>
        /// Replacement text. Its first character is used as the mask character;
        /// a matched word is replaced by this string padded with that character
        /// up to the length of the word.
        /// </summary>
        /// <exception cref="ArgumentException">The value is null or empty.</exception>
        public string ReplaceString
        {
            set
            {
                // Fail here instead of with an index error on the first match.
                if (string.IsNullOrEmpty(value))
                {
                    throw new ArgumentException("Replacement string must not be null or empty.", "value");
                }

                _replaceString = value;
            }
        }

        /// <summary>
        /// Separator between dictionary entries. The dictionary is loaded in the
        /// constructor, so assigning this afterwards does not re-read the file;
        /// pass the separator to the two-argument constructor instead.
        /// </summary>
        public char SplitString
        {
            set { _splitString = value; }
        }

        /// <summary>
        /// Text returned by the most recent <see cref="ReplaceBadWord"/> call.
        /// </summary>
        public string NewWord
        {
            get { return _newWord; }
        }

        /// <summary>
        /// Path of the dictionary file. Assigning it only stores the path; the
        /// dictionary is not reloaded.
        /// </summary>
        public string BadWordFilePath
        {
            get { return _badWordFilePath; }

            set { _badWordFilePath = value; }
        }

        /// <summary>
        /// Loads the dictionary from <paramref name="filePath"/> using '|' as separator.
        /// A missing file yields an empty dictionary.
        /// </summary>
        /// <param name="filePath">Path of the dictionary file.</param>
        public BadWordParse(string filePath)
            : this(filePath, '|')
        {
        }

        /// <summary>
        /// Loads the dictionary from <paramref name="filePath"/> using the given separator.
        /// A missing file yields an empty dictionary.
        /// </summary>
        /// <param name="filePath">Path of the dictionary file.</param>
        /// <param name="splitChar">Character separating the dictionary entries.</param>
        public BadWordParse(string filePath, char splitChar)
        {
            _badWordFilePath = filePath;
            _splitString = splitChar;

            string content = string.Empty;

            if (File.Exists(_badWordFilePath))
            {
                // using releases the reader even when ReadToEnd throws.
                using (StreamReader reader = new StreamReader(_badWordFilePath, Encoding.GetEncoding("gb2312")))
                {
                    content = reader.ReadToEnd();
                }
            }

            LoadWords(content);
        }

        /// <summary>
        /// Splits the dictionary text and fills the lookup tables.
        /// </summary>
        private void LoadWords(string content)
        {
            string[] badwords = content.Split(_splitString);

            foreach (string entry in badwords)
            {
                // Drop line terminators left around an entry (e.g. a trailing newline
                // at the end of the file); other whitespace is kept as part of the word.
                string word = entry.Trim('\r', '\n');

                // Empty entries (trailing/doubled separator, missing file) must not
                // count as words: they would force minWordLength to 0.
                if (word.Length == 0)
                {
                    continue;
                }

                maxWordLength = Math.Max(maxWordLength, word.Length);
                minWordLength = Math.Min(minWordLength, word.Length);

                for (int i = 0; i < 7 && i < word.Length; i++)
                {
                    fastCheck[word[i]] |= (byte)(1 << i);
                }

                // Positions 7 and beyond share the top bit.
                for (int i = 7; i < word.Length; i++)
                {
                    fastCheck[word[i]] |= 0x80;
                }

                if (word.Length == 1)
                {
                    charCheck[word[0]] = true;
                }
                else
                {
                    hash.Add(word);
                }
            }
        }

        /// <summary>
        /// Returns the length of the shortest multi-character dictionary word that
        /// starts at <paramref name="index"/>, or 0 when there is none.
        /// </summary>
        private int MatchLengthAt(string text, int index)
        {
            // Offset j corresponds to a candidate of length j + 1, so the last useful
            // offset is bounded by the longest word and by the end of the text.
            int lastOffset = Math.Min(maxWordLength - 1, text.Length - index - 1);

            for (int j = 1; j <= lastOffset; j++)
            {
                // Quick reject: this character never occurs at position j of any word.
                if ((fastCheck[text[index + j]] & (1 << Math.Min(j, 7))) == 0)
                {
                    return 0;
                }

                int length = j + 1;

                if (length >= minWordLength && hash.Contains(text.Substring(index, length)))
                {
                    return length;
                }
            }

            return 0;
        }

        /// <summary>
        /// Checks whether <paramref name="text"/> contains any dictionary word.
        /// </summary>
        /// <param name="text">Text to inspect; null or empty yields false.</param>
        /// <returns>true when at least one dictionary word occurs in the text.</returns>
        public bool HasBadWord(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return false;
            }

            for (int index = 0; index < text.Length; index++)
            {
                // Skip characters that cannot start any word.
                if ((fastCheck[text[index]] & 1) == 0)
                {
                    continue;
                }

                // Single-character words.
                if (charCheck[text[index]])
                {
                    return true;
                }

                // Multi-character words.
                if (MatchLengthAt(text, index) > 0)
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Replaces every dictionary word found in <paramref name="text"/>.
        /// Updates <see cref="IsHave"/> and <see cref="NewWord"/> for this call.
        /// </summary>
        /// <param name="text">Text to filter; null or empty is returned unchanged.</param>
        /// <returns>The filtered text.</returns>
        public string ReplaceBadWord(string text)
        {
            // Report the result of this call only, not of earlier ones.
            _isHave = false;

            if (string.IsNullOrEmpty(text))
            {
                _newWord = text;
                return text;
            }

            char mask = _replaceString[0];

            for (int index = 0; index < text.Length; index++)
            {
                // Skip characters that cannot start any word.
                if ((fastCheck[text[index]] & 1) == 0)
                {
                    continue;
                }

                // Single-character words: mask every occurrence of the character.
                if (charCheck[text[index]])
                {
                    _isHave = true;
                    text = text.Replace(text[index], mask);
                    continue;
                }

                // Multi-character words.
                int length = MatchLengthAt(text, index);

                if (length > 0)
                {
                    _isHave = true;

                    string sub = text.Substring(index, length);
                    string rp = _replaceString.PadRight(length, mask);
                    text = text.Replace(sub, rp);

                    // Continue after the replacement, which may be longer than the
                    // matched word when the replacement string itself is longer.
                    index += rp.Length - 1;
                }
            }

            _newWord = text;

            return text;
        }
    }
}

 

 

测试代码:

 

代码
 string filePath = "F://charCheck/charCheck/badword.txt";  
            
string testString = "";
            System.IO.StreamReader sr 
= new System.IO.StreamReader(filePath, System.Text.Encoding.GetEncoding("gb2312"));
            
//testString = sr.ReadToEnd();
            sr.Close();
            sr.Dispose();
            
//uint t = GetTickCount();
            BadWordParse bwp = new BadWordParse(filePath);
            
string parsedString = bwp.ReplaceBadWord(testString);
            
//uint time = GetTickCount() - t;
            
//Console.Write("使用时间:" + time.ToString());
            
//Console.Write("\r\n");
            
//Console.Write("原始字符串" + parsedString);
            
//Console.Write("\r\n");
            
//Console.Write("替换后字符串" + parsedString);