聊天系统违禁词过滤

网络游戏作为一种多人交互游戏,方便玩家在游戏中交流是很重要的。

为了防止玩家在游戏中说违禁的东西,所有的聊天系统都需要对聊天信息进行过滤。

本文描述了一种简单的基于DFA的算法,用于过滤聊天内容中的违禁词。算法的运行复杂度:遍历长度为n的输入字符串,最多对每个字符执行一次二分查找(lg n),所以最坏情况下也是O(n lg n)。

因为编写这段代码的时间很短,所以代码写得有点丑陋,但已经足以描述其主要思想了:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "wordfilter.h"


/*
 * One node of the forbidden-word trie. Each node stands for a single byte of
 * a word; the path from a root node down to a node with `end` set spells one
 * complete forbidden word.
 */
struct token{ 
    char   code;       // the byte (character code) this node represents
    struct token   **children;       // child nodes; getchild() binary-searches this array by code, so it is expected to be ordered ascending by code
    uint32_t       children_size;  // number of entries in children
    uint8_t        end;          // non-zero when a forbidden word ends at this node
};


/*
 * The filter itself: one trie root per possible first byte of a word.
 * Slots are indexed by the first byte cast to uint8_t; a NULL slot means no
 * forbidden word starts with that byte.
 */
typedef struct wordfilter{
    struct token * tokarry[256];
}*wordfilter_t;

/*
 * Create a new child node for byte `c` under `tok` and insert it into
 * tok->children so that the array stays ordered ascending by code (the new
 * node goes in front of the first existing child whose code is greater).
 *
 * tok : parent node; must not be NULL.
 * c   : byte the new child represents.
 *
 * Returns the newly created child, or NULL if memory could not be allocated
 * (in which case `tok` is left unchanged).
 *
 * The caller is expected to have checked that no child with code `c` exists
 * yet (see addchild); this function does not look for duplicates.
 */
struct token *inserttoken(struct token *tok,char c)     
{
    struct token *child = calloc(1, sizeof(*child));
    if (child == NULL)
        return NULL;
    child->code = c;

    struct token **grown = calloc((size_t)tok->children_size + 1, sizeof(*grown));
    if (grown == NULL) {
        free(child);
        return NULL;
    }

    /* Slot for the new node: index of the first child with a larger code,
     * or children_size when every existing child sorts before it. */
    uint32_t slot = 0;
    while (slot < tok->children_size && tok->children[slot]->code <= c)
        ++slot;

    /* Copy the children in front of the slot unchanged, drop the new node
     * in, then shift every remaining child up by one position so that none
     * is lost and none is duplicated. */
    uint32_t i;
    for (i = 0; i < slot; ++i)
        grown[i] = tok->children[i];
    grown[slot] = child;
    for (i = slot; i < tok->children_size; ++i)
        grown[i + 1] = tok->children[i];

    free(tok->children);   /* NULL for a node without children; free(NULL) is a no-op */
    tok->children = grown;
    tok->children_size++;
    return child;
}

/*
 * Binary-search tok->children for the node whose code equals `c`.
 * Relies on the children array being ordered ascending by code.
 *
 * Returns the matching child, or NULL when `tok` has no children or none of
 * them carries code `c`.
 */
static struct token *getchild(struct token *tok,char c)
{
    if (tok->children_size == 0)
        return NULL;

    int lo = 0;
    int hi = tok->children_size - 1;

    /* Narrow [lo, hi] until at most one candidate is left. */
    while (hi > lo) {
        int mid = lo + (hi - lo) / 2;
        struct token *probe = tok->children[mid];

        if (probe->code == c)
            return probe;

        if (probe->code > c)
            hi = mid - 1;
        else
            lo = mid + 1;
    }

    /* lo is always a valid index here, even when hi has dropped below it. */
    struct token *last = tok->children[lo];
    return last->code == c ? last : NULL;
}


/*
 * Return the child of `tok` for byte `c`, creating and inserting it first
 * when it does not exist yet.
 */
static struct token *addchild(struct token *tok,char c){
    struct token *existing = getchild(tok, c);
    return existing ? existing : inserttoken(tok, c);
}

/*
 * Walk the trie from `tok` along str[i], str[i+1], ... and record the longest
 * forbidden word seen on the way.
 *
 * tok      : node already matched for the character just before str[i].
 * str      : NUL-terminated input text.
 * i        : index of the next character to try.
 * maxmatch : out-parameter; whenever a word end is reached it is set to the
 *            index one past that word's last character. Left untouched when
 *            nothing matches.
 */
static void NextChar(struct token *tok,const char *str,int i,int *maxmatch)
{
    struct token *cur = tok;

    for (; str[i] != 0; ++i) {
        struct token *next = getchild(cur, str[i]);

        if (next == NULL) {
            /* Dead end: if the node we stopped on closes a word, the match
             * ends right before str[i]. */
            if (cur->end)
                *maxmatch = i;
            return;
        }

        /* A word ends on str[i]; keep going in case a longer one follows. */
        if (next->end)
            *maxmatch = i + 1;

        cur = next;
    }
}


/*
 * Try to match a forbidden word starting at str[*pos].
 *
 * filter : the word filter to match against.
 * str    : NUL-terminated input text; str[*pos] must not be the terminator.
 * pos    : in/out. On a match it is moved to the index one past the matched
 *          word; otherwise it is advanced by exactly one character.
 *
 * Returns 1 when a forbidden word starts at the original *pos, 0 otherwise.
 */
static uint8_t processWord(wordfilter_t filter,const char *str,int *pos)
{
    const int start = *pos;
    struct token *root = filter->tokarry[(uint8_t)str[start]];

    /* No forbidden word begins with this byte. */
    if (!root) {
        *pos = start + 1;
        return 0;
    }

    int longest = 0;
    NextChar(root, str, start + 1, &longest);

    if (longest != 0) {
        *pos = longest;
        return 1;
    }

    /* Nothing was recorded past the first character; the only possible hit
     * is a one-character word made of that first byte alone. */
    *pos = start + 1;
    return root->end ? 1 : 0;
}

/*
 * Build a word filter from a NULL-terminated array of forbidden words.
 *
 * forbidwords : array of NUL-terminated strings, terminated by a NULL entry.
 *               A NULL array is treated as an empty list. Empty strings in
 *               the list are ignored.
 *
 * Returns the new filter, or NULL if memory could not be allocated.
 * NOTE(review): on a mid-build allocation failure the nodes created so far
 * are not released, because no trie destructor is visible in this file.
 */
wordfilter_t wordfilter_new(const char **forbidwords){
    wordfilter_t filter = calloc(1, sizeof(*filter));
    if (filter == NULL)
        return NULL;
    if (forbidwords == NULL)
        return filter;

    for (size_t w = 0; forbidwords[w] != NULL; ++w) {
        const char *word = forbidwords[w];

        /* An empty word can never match; registering it would only allocate
         * a useless root node for byte 0. */
        if (word[0] == '\0')
            continue;

        /* Index by unsigned byte value so bytes >= 0x80 do not turn into a
         * negative array index where plain char is signed. */
        uint8_t first = (uint8_t)word[0];
        struct token *tok = filter->tokarry[first];
        if (tok == NULL) {
            tok = calloc(1, sizeof(*tok));
            if (tok == NULL)
                return NULL;
            tok->code = word[0];
            filter->tokarry[first] = tok;
        }

        /* Descend (creating nodes as needed) for the remaining bytes. */
        for (size_t k = 1; word[k] != '\0'; ++k) {
            tok = addchild(tok, word[k]);
            if (tok == NULL)
                return NULL;
        }

        /* Mark the node for the last byte as the end of a complete word. */
        tok->end = 1;
    }
    return filter;
}

/*
 * Check whether `str` is free of forbidden words.
 *
 * filter : the word filter to check against.
 * str    : NUL-terminated text to scan.
 *
 * Returns 1 when no forbidden word occurs anywhere in `str`, 0 as soon as
 * the first one is found.
 */
uint8_t isvaildword(wordfilter_t filter,const char *str)
{
    int len = strlen(str);
    int cursor = 0;

    /* processWord advances cursor on every call, so the scan terminates. */
    while (cursor < len) {
        if (processWord(filter, str, &cursor))
            return 0;
    }
    return 1;
}

/*
 * Produce a copy of `str` in which every forbidden word is masked.
 *
 * filter  : the word filter to match against.
 * str     : NUL-terminated input text.
 * replace : masking character. Every byte of a forbidden word is turned into
 *           `replace`, then each run of consecutive `replace` characters is
 *           collapsed into a single one (this also collapses runs of that
 *           character already present in the input).
 *
 * Returns a new string_t holding the filtered text. If the scratch buffer
 * cannot be allocated, an empty string is returned so that unfiltered text is
 * never handed back.
 */
string_t wordfiltrate(wordfilter_t filter,const char *str,char replace){
    int size = strlen(str);

    char *tmp = calloc(1, (size_t)size + 1);
    if (tmp == NULL)
        return new_string("");
    memcpy(tmp, str, (size_t)size);   /* calloc already supplied the terminator */

    /* Pass 1: overwrite every matched word with the masking character. */
    for (int i = 0; i < size; ) {
        int start = i;
        if (processWord(filter, str, &i))
            memset(tmp + start, replace, (size_t)(i - start));
    }

    /* Pass 2: collapse runs of the masking character, compacting in place.
     * The write index never overtakes the read index, so this is safe. The
     * first character of each run is written explicitly; merely advancing
     * the write index would leave a stale character in the output for every
     * run after the first. */
    int out = 0;
    int in_run = 0;
    for (int i = 0; i < size; ++i) {
        if (tmp[i] == replace) {
            if (!in_run) {
                tmp[out++] = replace;
                in_run = 1;
            }
        } else {
            tmp[out++] = tmp[i];
            in_run = 0;
        }
    }
    tmp[out] = 0;

    /* new_string is presumed to copy its argument, since the scratch buffer
     * is released right after the call (the original code did the same). */
    string_t ret = new_string(tmp);
    free(tmp);
    return ret;
}

 

 

 

 

posted @ 2010-01-11 23:48  sniperHW  阅读(3471)  评论(2)    收藏  举报