字典树

字典树：又称为Trie，是一种用于快速检索的多叉树结构。Trie把要查找的关键词看作一个字符序列，并根据构成关键词字符的先后顺序构造用于检索的树结构；一棵m度的Trie树或者为空，或者由m棵m度的Trie树构成。

在Trie树中查找一个关键字的时间和树中包含的结点数无关，而取决于组成关键字的字符数。

如果要查找的关键字可以分解成字符序列且不是很长，利用Trie树查找速度优于二叉查找树。

若关键字长度最大是5，则利用Trie树，利用5次比较可以从26⁵＝11881376个可能的关键字中检索出指定的关键字。而利用二叉查找树至少要进行log₂26⁵=23.5次比较。

字典树是一种用来查询某个单词（前缀）在所有单词中出现次数的数据结构，它的插入和查询复杂度都为O(len)，其中len为单词（前缀）的长度；但是它的空间复杂度非常高：如果字符集是26个字母，那么每个节点的度就是26，是典型的以空间换时间的结构。

字典树基本模板：

#define MAX 26 // alphabet size: number of child slots in every node

// One trie node. Children are indexed by (character - 'a') in InsertTrie/SearchTrie.
typedef struct TrieNode

{

int nCount; // set to 1 by CreateTrieNode, incremented by InsertTrie each time an insertion passes through this node

struct TrieNode *next[MAX]; // child pointers, NULL where no child exists

}TrieNode;

// Static pool from which CreateTrieNode hands out nodes.
TrieNode Memory[1000000];

// Index of the next unused entry of Memory.
int allocp = 0;

/*
 * Initialise an empty trie by storing NULL in the caller's root pointer.
 *
 * pRoot: address of the root pointer to reset.
 *
 * No node is allocated here; InsertTrie creates the root when it finds
 * the pointer still NULL.
 */
void InitTrieRoot(TrieNode **pRoot)
{
    *pRoot = NULL;
}

/*
 * Take the next unused node out of the static Memory pool.
 *
 * Returns a pointer to the node, with nCount set to 1 and every child
 * pointer set to NULL.
 *
 * NOTE: allocp is advanced without being compared against the size of
 * Memory, so the caller must not request more nodes than the pool holds.
 */
TrieNode *CreateTrieNode()
{
    TrieNode *node = &Memory[allocp];
    int slot = 0;

    allocp++;
    node->nCount = 1;
    while (slot < MAX) {
        node->next[slot] = NULL;
        slot++;
    }
    return node;
}

/*
 * Insert the word s into the trie rooted at *pRoot.
 *
 * pRoot: address of the root pointer; the root node is created here if
 *        the pointer is still NULL.
 * s:     NUL-terminated word. Every character c must satisfy
 *        0 <= c - 'a' < MAX (the lowercase letters in ASCII).
 *
 * For each character the matching child is either created (its nCount
 * starts at 1) or has its nCount incremented, so every node on the path
 * ends up holding the number of insertions that passed through it.
 *
 * A word containing a character outside the accepted range is ignored
 * entirely, because its branch index would fall outside next[].
 */
void InsertTrie(TrieNode **pRoot , char *s)
{
    int i, k;
    TrieNode *p;

    if (!(p = *pRoot))
        p = *pRoot = CreateTrieNode();

    /* Validate the whole word before touching any counter, so that a bad
       character can never leave a half-inserted prefix behind. */
    for (i = 0; s[i]; i++) {
        k = s[i] - 'a';
        if (k < 0 || k >= MAX)
            return;
    }

    for (i = 0; s[i]; i++) {
        k = s[i] - 'a'; /* select the branch for this character */
        if (p->next[k])
            p->next[k]->nCount++;
        else
            p->next[k] = CreateTrieNode();
        p = p->next[k];
    }
}

/*
 * Look up the prefix s in the trie rooted at *pRoot.
 *
 * pRoot: address of the root pointer.
 * s:     NUL-terminated prefix to look for.
 *
 * Returns the nCount of the node reached by following s from the root,
 * i.e. the number of insertions that passed through that node. Returns 0
 * when the trie is empty, when the path does not exist, or when s contains
 * a character c with c - 'a' outside 0..MAX-1 (such a character cannot
 * index next[] and therefore cannot be part of any stored word).
 * For an empty s the root node's own nCount is returned.
 */
int SearchTrie(TrieNode **pRoot , char *s)
{
    TrieNode *p;
    int i, k;

    if (!(p = *pRoot))
        return 0;

    for (i = 0; s[i]; i++) {
        k = s[i] - 'a';
        if (k < 0 || k >= MAX)
            return 0; /* would index outside next[] */
        if (p->next[k] == NULL)
            return 0;
        p = p->next[k];
    }
    return p->nCount;
}

统计难题（这里都用数组分配结点，用malloc分配太慢了）：这题就是统计一组字符串中某前缀出现的次数（字典树第一类应用），因此只要简单地套模板就行了（在节点中设置一个成员变量nCount，来记录该字符出现的次数）。

#include <stdio.h>

#define MAX 26 // number of child slots in every node

// One trie node. Children are indexed by (character - 'a') in InsertTrie/SearchTrie.
typedef struct TrieNode

{

int nCount; // set to 1 by CreateTrieNode, incremented by InsertTrie each time an insertion passes through this node

struct TrieNode *next[MAX]; // child pointers, NULL where no child exists

}TrieNode;

// Static pool from which CreateTrieNode hands out nodes.
TrieNode Memory[1000000];

// Index of the next unused entry of Memory.
int allocp = 0;

/*
 * Initialise an empty trie by storing NULL in the caller's root pointer.
 *
 * pRoot: address of the root pointer to reset.
 *
 * No node is allocated here; InsertTrie creates the root when it finds
 * the pointer still NULL.
 */
void InitTrieRoot(TrieNode **pRoot)
{
    *pRoot = NULL;
}

/*
 * Take the next unused node out of the static Memory pool.
 *
 * Returns a pointer to the node, with nCount set to 1 and every child
 * pointer set to NULL.
 *
 * NOTE: allocp is advanced without being compared against the size of
 * Memory, so the caller must not request more nodes than the pool holds.
 */
TrieNode *CreateTrieNode()
{
    TrieNode *node = &Memory[allocp];
    int slot = 0;

    allocp++;
    node->nCount = 1;
    while (slot < MAX) {
        node->next[slot] = NULL;
        slot++;
    }
    return node;
}

/*
 * Insert the word s into the trie rooted at *pRoot.
 *
 * pRoot: address of the root pointer; the root node is created here if
 *        the pointer is still NULL.
 * s:     NUL-terminated word. Every character c must satisfy
 *        0 <= c - 'a' < MAX (the lowercase letters in ASCII).
 *
 * For each character the matching child is either created (its nCount
 * starts at 1) or has its nCount incremented, so every node on the path
 * ends up holding the number of insertions that passed through it.
 *
 * A word containing a character outside the accepted range is ignored
 * entirely, because its branch index would fall outside next[].
 */
void InsertTrie(TrieNode **pRoot , char *s)
{
    int i, k;
    TrieNode *p;

    if (!(p = *pRoot))
        p = *pRoot = CreateTrieNode();

    /* Validate the whole word before touching any counter, so that a bad
       character can never leave a half-inserted prefix behind. */
    for (i = 0; s[i]; i++) {
        k = s[i] - 'a';
        if (k < 0 || k >= MAX)
            return;
    }

    for (i = 0; s[i]; i++) {
        k = s[i] - 'a'; /* select the branch for this character */
        if (p->next[k])
            p->next[k]->nCount++;
        else
            p->next[k] = CreateTrieNode();
        p = p->next[k];
    }
}

/*
 * Look up the prefix s in the trie rooted at *pRoot.
 *
 * pRoot: address of the root pointer.
 * s:     NUL-terminated prefix to look for.
 *
 * Returns the nCount of the node reached by following s from the root,
 * i.e. the number of insertions that passed through that node. Returns 0
 * when the trie is empty, when the path does not exist, or when s contains
 * a character c with c - 'a' outside 0..MAX-1 (such a character cannot
 * index next[] and therefore cannot be part of any stored word).
 * For an empty s the root node's own nCount is returned.
 */
int SearchTrie(TrieNode **pRoot , char *s)
{
    TrieNode *p;
    int i, k;

    if (!(p = *pRoot))
        return 0;

    for (i = 0; s[i]; i++) {
        k = s[i] - 'a';
        if (k < 0 || k >= MAX)
            return 0; /* would index outside next[] */
        if (p->next[k] == NULL)
            return 0;
        p = p->next[k];
    }
    return p->nCount;
}

/* Size of the line buffer used by main; longer input lines are truncated. */
enum { LINE_BUF_SIZE = 64 };

/*
 * Read one line from stdin into buf (at most size-1 characters) and strip
 * the trailing newline.
 *
 * Returns 1 when a line was read, 0 on end of input or read error.
 * If the line does not fit, the part that fits is kept and the rest of the
 * line is discarded, so that it is not mistaken for the next line.
 */
static int ReadLine(char *buf, int size)
{
    int len = 0;
    int c;

    if (fgets(buf, size, stdin) == NULL)
        return 0;

    while (buf[len] != '\0' && buf[len] != '\n')
        len++;

    if (buf[len] == '\n') {
        buf[len] = '\0';
    } else {
        /* No newline stored: the line was too long or input ended here.
           Drop whatever is left of the line. */
        while ((c = getchar()) != '\n' && c != EOF)
            ;
    }
    return 1;
}

/*
 * Read words (one per line) up to the first empty line and insert each
 * into the trie; then read prefixes (one per line) until end of input and
 * print, for each, the value returned by SearchTrie.
 *
 * Input is read with a bounded fgets-based helper instead of gets(), which
 * cannot limit how much it writes and was removed from the language in C11.
 */
int main(void)
{
    char s[LINE_BUF_SIZE];
    TrieNode *Root = NULL;

    InitTrieRoot(&Root);

    /* Dictionary section: ends at the first empty line or at end of input. */
    while (ReadLine(s, (int)sizeof s) && s[0]) {
        InsertTrie(&Root , s);
    }

    /* Query section: runs until end of input. */
    while (ReadLine(s, (int)sizeof s)) {
        printf("%d\n", SearchTrie(&Root , s));
    }

    return 0;
}

另外，下面是一个字典树的变种：树的每个节点不再存储字符，而是存储单词，利用strcmp形成一棵排序二叉树；利用这个结构，可以统计词频：

#include <stdio.h>

#include <ctype.h>

#include <string.h>

#include <stdlib.h>

#define MAXWORD 100 // size of the word buffer that main passes to getword

// NOTE(review): open is declared here but never called in the visible code — confirm it is still needed.
int open(char*Vocabulary,int mode);

struct tnode{ // a tree node

char *word; // points to the text of the word

int count; // number of occurrences of the word

struct tnode *left; // left child

struct tnode *right; // right child

};

// Adds a word to the tree (or bumps its count) and returns the subtree root.
struct tnode *addtree(struct tnode *,char *);

// Prints the tree in order.
void treeprint(struct tnode *);

// Reads the next word or single character from the input.
int getword(char *,int);

/*
 * Word-frequency count: read tokens with getword until it returns EOF,
 * add every token that starts with a letter to the tree, then print the
 * tree with treeprint.
 *
 * The return type is spelled out because implicit int was removed from the
 * language in C99.
 */
int main(void)
{
    struct tnode *root = NULL;
    char word[MAXWORD];

    while (getword(word, MAXWORD) != EOF) {
        /* <ctype.h> functions need a value representable as unsigned char */
        if (isalpha((unsigned char)word[0]))
            root = addtree(root, word);
    }
    treeprint(root);
    return 0;
}

struct tnode *talloc(void);

/*
 * copyword: return a freshly malloc'ed copy of s, or NULL when memory is
 * exhausted. Used instead of strdup, which <string.h> only declares in
 * strict ISO C from C23 onwards.
 */
static char *copyword(const char *s)
{
    size_t size = strlen(s) + 1;
    char *copy = malloc(size);

    if (copy != NULL)
        memcpy(copy, s, size);
    return copy;
}

/*
 * addtree: add the word w at node p or somewhere below it.
 *
 * p: root of the (sub)tree, or NULL for an empty one.
 * w: NUL-terminated word; the tree stores its own copy.
 *
 * Returns the root of the (sub)tree after the insertion, which the caller
 * must store back. If w is already present its count is incremented;
 * otherwise a new node with count 1 is created, smaller words (by strcmp)
 * going to the left and larger ones to the right.
 *
 * If memory for a new node cannot be obtained, NULL is returned for the
 * position where the node would have been created; the rest of the tree is
 * left unchanged and the word is not recorded.
 */
struct tnode *addtree(struct tnode *p,char *w)
{
    int cond;

    if (p == NULL) {                 /* a new word has arrived */
        p = talloc();
        if (p == NULL)
            return NULL;
        p->word = copyword(w);
        if (p->word == NULL) {
            free(p);
            return NULL;
        }
        p->count = 1;
        p->left = p->right = NULL;
    } else if ((cond = strcmp(w, p->word)) == 0) {
        p->count++;                  /* repeated word */
    } else if (cond < 0) {
        p->left = addtree(p->left, w);   /* smaller: left subtree */
    } else {
        p->right = addtree(p->right, w); /* larger: right subtree */
    }
    return p;
}

/*
 * treeprint: in-order print of the tree rooted at p.
 *
 * Visits the left subtree, prints the node as "<count> <word>" on its own
 * line (count right-aligned in a field of width 6), then visits the right
 * subtree. Does nothing when p is NULL.
 */
void treeprint(struct tnode *p)
{
    if (p == NULL)
        return;

    treeprint(p->left);
    printf("%6d %s\n", p->count, p->word);
    treeprint(p->right);
}

/*
 * getword: get the next word or single character from the input.
 *
 * word: destination buffer.
 * lim:  size of word in bytes; must be at least 2.
 *
 * Leading whitespace is skipped. If the next character is a letter, it and
 * the following run of letters and digits are stored in word (at most
 * lim-1 characters, always NUL-terminated) and the first character is
 * returned. Otherwise that single character is stored and returned; at end
 * of input word is empty and EOF is returned.
 *
 * The character that ends a word is pushed back with ungetch. When a word
 * is longer than lim-1 characters the remainder stays in the input.
 */
int getword(char *word,int lim)
{
    int c,getch(void);
    void ungetch(int);
    char *w = word;

    while (isspace(c = getch()))
        ;
    if (c != EOF)
        *w++ = c;
    if (!isalpha(c)) {
        *w = '\0';
        return c;
    }

    /* One byte already holds the first letter and one is reserved for the
       terminator, so at most lim-2 further characters may be stored. */
    for (lim -= 2; lim > 0; lim--) {
        /* Keep the character in an int so EOF survives until ungetch. */
        c = getch();
        if (!isalnum(c)) {
            ungetch(c);
            break;
        }
        *w++ = c;
    }
    *w = '\0';
    return word[0];
}

#define BUFSIZE 100 /* capacity of the pushback stack */

/*
 * Pushback stack shared by getch and ungetch. The elements are int rather
 * than char so that every value ungetch receives, including EOF and bytes
 * above 127, comes back from getch unchanged.
 */
int buf[BUFSIZE];

int bufp=0; /* next free position in buf */

/*
 * getch: return the most recently pushed-back character if there is one,
 * otherwise read a character from standard input with getchar.
 */
int getch(void)
{
    return (bufp > 0) ? buf[--bufp] : getchar();
}

/*
 * ungetch: push c back so that the next getch returns it.
 * When the stack is already full, c is dropped and a message is printed.
 */
void ungetch(int c)
{
    if (bufp >= BUFSIZE)
        printf("ungetch:toomany charactors\n");
    else
        buf[bufp++] = c;
}

#include <stdlib.h>

/*
 * talloc: obtain storage for one tnode from malloc.
 *
 * Returns whatever malloc returns; the fields of the node are not
 * initialised here.
 */
struct tnode *talloc(void)
{
    struct tnode *node = malloc(sizeof *node);

    return node;
}

节选自：http://www.cnblogs.com/DiaoCow/archive/2010/04/19/1715337.html

posted @ 2012-11-20 19:29 gqtc 阅读(120) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

程序员的自我修养

字典树

公告