一种基于自动机的快速分词方法

分词是自然语言处理入门的第一步，我参照导师的论文写了一个基于自动机的快速分词方法；

其主要内容是字典的构建，字典的结构能够决定分词的效率；

本系统分词的实现中，第一个字的查找采用的是hash，查找的时间复杂度为O(1)；第二个字的查找是用二分查找实现的，最多比较1+log2(n)次，即时间复杂度为O(log n)；其余部分采用的是顺序查找，整体的时间复杂度跟词的长度有关。

实现的框架如下：

// Byte-range bounds used by the dictionary's hash table dimensions.
// NOTE(review): presumably the lead-byte (START1..END1) and trail-byte
// (START2..END2) bounds of double-byte Chinese characters (GB2312-style
// encoding) -- the original comment only says "everyone in NLP knows what
// these mean"; confirm against the implementation.
const int START1 = 0xB0;
const int START2 = 0xA1;
const int END1 = 0xF8;
const int END2 = 0xFF;

// Original note: "length used when reading the file".
const int MAXWORDLEN = 48;

// Data node used for every character of a word other than the first two.
struct ThirdWord
{
    // The character stored in this node.
    string key;
    // Whether a word ends at this character.
    bool isPhrase;
    // Pointer to a node on the same level as this one (a sibling).
    ThirdWord *L;
    // Pointer to the node for the next character of the current word.
    ThirdWord *R;

    // Copies each argument into the corresponding member. Only Key is
    // required; the flag defaults to false and both links default to null.
    ThirdWord(string Key, bool IsPhrase = false, ThirdWord* l = 0, ThirdWord* r = 0)
        : key(Key)
        , isPhrase(IsPhrase)
        , L(l)
        , R(r)
    {
    }
};

// Node for the second character of a word.
struct SecondWord
{
    // The character stored in this node.
    string key;
    // NOTE(review): the original comment is elided ("... ..."); presumably the
    // same meaning as ThirdWord::isPhrase (a word ends here) -- confirm.
    bool isPhrase;
    // Points to the child node, i.e. the node for the character that follows.
    ThirdWord *child;

    // Copies the three arguments into the members; all are required.
    SecondWord(string Key, bool IsPhrase, ThirdWord* Child)
        : key(Key)
        , isPhrase(IsPhrase)
        , child(Child)
    {
    }
};

// Entry for the first character of a word, together with the table of
// second characters recorded under it.
struct HeadWord
{
    // The first character.
    string key;

    // Table of second-character nodes.
    vector<SecondWord> secWord;
};

// Dictionary used by the segmenter: owns the first-character table and the
// hash table that maps a character to its slot in that table, and exposes
// printing and segmentation.
//
// Fix: several declarations were followed by full-width ideographic spaces
// (U+3000) before their trailing comment. Those characters sit outside any
// comment and are not valid C++ source characters, so the class failed to
// compile; they are replaced by ordinary ASCII spaces here. Signatures are
// left untouched so out-of-class definitions keep matching.
class Dictionary
{
private:
    ifstream fin;            // stream used for reading the file
    vector<HeadWord> head;   // table of first characters
    // Hash table; each cell stores the index in `head` of the corresponding
    // character. NOTE(review): presumably indexed by the character's two
    // bytes offset by START1 / START2 -- confirm against the implementation.
    int HASH[END1-START1+1][END2-START2+1];

    // Reads a number (the original note reads "读书字", presumably a typo for
    // "读数字", i.e. "read a number").
    int getNumber();
    bool IsCC(char c);    // is it a Chinese character
    bool IsEC(char c);    // is it an English character
    string getLine();     // reads one line
    void loadDictionary();    // loads the dictionary
    int StrtoInt(string s);   // conversion from string to int
    unsigned CharToInt(char c);   // conversion (char to unsigned)
    int biSearch(unsigned x, string secWd);   // binary search
    ThirdWord* seqSearch(ThirdWord* p, string cc);   // lookup from the third character onward
    void printRemain(string s, ThirdWord* &child);     // prints what follows the second character
    void processRemain(string s, ThirdWord* &child);   // processes what follows the second character
    // Skips characters at the start that are not Chinese.
    void skipNoChinese(string s, vector<string> &test, unsigned &sp, unsigned &ep);

public:
    Dictionary(string file);   // constructs the dictionary
    void print();              // prints
    void segment(string s, vector<string> &test);   // word segmentation
};

posted on 2011-11-10 20:02 _Clarence 阅读(241) 评论(0) 收藏举报

刷新页面返回顶部

一种基于自动机的快速分词方法

导航

公告