trie练习
帮msdn朋友写的.
应用背景
随着互联网信息急剧地增加,要在互联网中检索到自己想要的信息变得非常困难。全文搜索引擎的出现,使我们能够在庞大的互联网中检索到自己需要的信息。Google,Baidu是目前文本搜索领域最具代表性的两个高效搜索引擎。如果你在baidu或者google的搜索框中输入ja,就会出现一个候选框,如下所示:

候选框中都是以ja为前缀的word,本次课程设计我们就来探索如何解决这样的问题。
此外,当你输入java然后点search的时候,被检索到的都是包含java的网页,如下图所示。在这个检索结果中,我们可以把每个网页看成一个Document,这个问题就可以描述为如何快速地在所有的Document中检索到包含java的全部Document。

下载
包括: 代码, 测试数据, 程序背景说明
https://files.cnblogs.com/LeeCe/trie.rar
功能
1. 建立trie树字典, 测试数据约120W词(有重复词).
2. 查找trie树某个词, 如果存在, 输出词典中重复次数.
3. 查找所有前缀词,
4. 查找前缀词中数量最多的前10个
5. 查找字典中出现数量最多的前10个
使用方法
新建“空win32项目”, 引入exp_tree.h和exp_tree.cpp两个文件, 将“vocabulary.txt”字典文件放到项目根目录下, 调用trie_test()即可.
参考文献
trie算法《http://zh.wikipedia.org/zh/Trie》
代码
//main.cpp
#include "exp_tree.h"
// Program entry point: runs the trie demo and reports its outcome to the OS.
// Returns 0 when trie_test() returns 0, and 1 otherwise.
int main()
{
    // The exit status used to be a hard-coded 1, which callers interpret as
    // failure even when the demo ran correctly.
    return trie_test() == 0 ? 0 : 1;
}
代码
//exp_tree.h
#if !defined __TREE__H__20101229__
#define __TREE__H__20101229__
//////////////////////////////////////////////////////////////////////////
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <windows.h>
#include "sys/stat.h"
//////////////////////////////////////////////////////////////////////////
// Number of child slots in every trie node (size of trie_node_st::next).
#define MAX_TRIE_SIZE 256
// Size in bytes of the fixed word buffers (top-10 list entries, line buffer in trie_test).
#define MAX_WORD_LENGTH 128
// One node of the trie. The path of child indices from the root spells a word.
struct trie_node_st{
// Incremented by insert() each time a word ends at this node; 0 means no word ends here.
int count;
// Value of the file-level insertion counter at the most recent insert() ending here.
int prio;
// Child nodes, indexed by the next character of the word; NULL when absent.
struct trie_node_st *next[MAX_TRIE_SIZE];
};
// One entry of the doubly linked "hot 10" list (most frequent words overall).
struct hot_10_node_st{
// Occurrence count stored for this entry by insert_hot_10().
int count;
// Word text copied in by insert_hot_10().
char word[MAX_WORD_LENGTH];
// Following / preceding entry in the list; NULL at the ends.
hot_10_node_st *next;
hot_10_node_st *pre;
};
// Handle of the "hot 10" list: its first and last entry.
struct hot_10_st{
hot_10_node_st *head;
hot_10_node_st *tail;
};
// One entry of the doubly linked "pre 10" list (top words sharing a prefix).
struct pre_10_node_st{
// Occurrence count copied from the trie node by insert_pre_10().
int count;
// prio copied from the trie node; pre_cmp() uses it to break ties on count.
int prio;
// Word text copied in by insert_pre_10().
char word[MAX_WORD_LENGTH];
// Following / preceding entry in the list; NULL at the ends.
pre_10_node_st *next;
pre_10_node_st *pre;
};
// Handle of the "pre 10" list: its first and last entry.
struct pre_10_st{
pre_10_node_st *head;
pre_10_node_st *tail;
};
// Short aliases used by the function prototypes and the implementation file.
typedef struct trie_node_st trie_node;
typedef struct hot_10_node_st hot_10_node;
typedef struct hot_10_st hot_10_list;
typedef struct pre_10_node_st pre_10_node;
typedef struct pre_10_st pre_10_list;
//////////////////////////////////////////////////////////////////////////
// 1. Build the dictionary trie / print every stored word.
int insert(const char *word, trie_node *troot);
int travel( trie_node *troot );
// 2. Look up a single word and print its occurrence count.
int find_word( const char *searchword, trie_node *troot, int size);
// 3. Print every stored word that starts with the first n characters of searchword.
int find_preword( const char *searchword, trie_node *troot, int n);
// 4. Top 10 (by count, then prio) among the words sharing a prefix.
int find_pre_10( const char *searchword, trie_node *troot, int n);
int pre_cmp(trie_node *pre1, pre_10_node *pre2);
// Declared as build_* to match the definitions in exp_tree.cpp; the former
// built_* spelling named functions that are never defined.
int build_pre_10();
int insert_pre_10(char *word, int n, trie_node *node);
int show_pre_10();
// 5. Top 10 (by count) across the whole dictionary.
int find_hot_10(trie_node *troot);
int build_hot_10();
int insert_hot_10(char *word, int n, int num);
int show_hot_10();
// Dictionary line reader and the demo driver.
int getword(char *word);
int trie_test();
//////////////////////////////////////////////////////////////////////////
#endif
代码
//exp_tree.cpp
#include "exp_tree.h"
//////////////////////////////////////////////////////////////////////////
//static tire_root troot;
//////////////////////////////////////////////////////////////////////////
// Dictionary file contents: mfDic is the start of the heap buffer allocated in
// trie_test(), mfDicCur is the read cursor that getword() advances.
static char *mfDic, *mfDicCur;
// Top-10 lists; empty until the first insert_hot_10() / insert_pre_10() builds them.
static hot_10_list hot_10= {NULL, NULL};
static pre_10_list pre_10= {NULL, NULL};
// Insertion counter: insert() stores the current value in the node's prio, then increments it.
static int time= 0;
// Records one occurrence of `word` in the trie rooted at `troot`.
//
// Missing nodes along the word's path are created. The node reached by the
// last character gets its count incremented and its prio set to the current
// value of the file-level insertion counter, which is then advanced.
//
// Returns 0 on success (an empty word is ignored and also yields 0), or -1 when
// an argument is NULL or a node cannot be allocated. After an allocation
// failure the nodes created so far stay in the trie with count 0.
int insert(const char *word, trie_node *troot)
{
    trie_node *curr;
    size_t i;

    if (word == NULL || troot == NULL)
        return -1;
    if ('\0' == word[0])
        return 0;

    curr = troot;
    // Stop at the terminator before touching next[]: no node is created for '\0'.
    for (i = 0; word[i] != '\0'; i++)
    {
        // plain char may be signed; go through unsigned char so bytes >= 0x80
        // select slots 128..255 instead of a negative index.
        unsigned char slot = (unsigned char)word[i];

        if (curr->next[slot] == NULL)
        {
            trie_node *newnode = (trie_node *)calloc(1, sizeof(trie_node));
            if (newnode == NULL)
                return -1;
            curr->next[slot] = newnode;
        }
        curr = curr->next[slot];
    }
    curr->prio = time++;
    curr->count++;
    return 0;
}
int travel( trie_node *troot )
{
static int sum= 0, pos= 0;
static char word[128];
int i;
if (troot == NULL)
return 0;
if (troot->count)
{
word[pos]='\0';
printf("%s\n", word);
sum++;
}
for (i= 0; i< MAX_TRIE_SIZE; i++)
{
word[pos++]= i;
travel(troot->next[i]);
pos--;
}
return 0;
}
int find_word( const char *searchword, trie_node *troot, int size)
{
static int pos= 0;
static char word[128];
int i;
if (troot == NULL)
return 0;
if (troot->count)
{
word[pos]='\0';
if( strlen(word) == size && strncmp(searchword, word, strlen(word)) == 0)
{
printf("%s, %d\n", word, troot->count);
return -1;
}
}
for (i= 0; i< MAX_TRIE_SIZE; i++)
{
word[pos++]= i;
if(find_word(searchword, troot->next[i], size) < 0) return -1;
pos--;
}
return 0;
}
int find_preword( const char *searchword, trie_node *troot, int n)
{
static int pos= 0;
static char word[128];
int i;
if (troot == NULL)
return 0;
if (troot->count && strncmp(searchword, word, n) == 0)
{
word[pos]='\0';
printf("%s, %d\n", word, troot->count);
}
for (i= 0; i< MAX_TRIE_SIZE; i++)
{
word[pos++]= i;
find_preword(searchword, troot->next[i], n);
pos--;
}
return 0;
}
// Three-way comparison of a trie node against a "pre 10" list entry.
// The count is compared first; prio decides only when the counts are equal.
// Returns 1 when pre1 ranks higher, -1 when it ranks lower, 0 when both keys match.
int pre_cmp(trie_node *pre1, pre_10_node *pre2)
{
    int lhs = pre1->count;
    int rhs = pre2->count;

    if (lhs == rhs)
    {
        // tie on count: fall back to the secondary key
        lhs = pre1->prio;
        rhs = pre2->prio;
    }
    return (lhs > rhs) - (lhs < rhs);
}
int build_pre_10()
{
pre_10_node *pCurr;
int i;
if (pre_10.head == NULL)
pre_10.head= (pre_10_node *)calloc(1, sizeof(pre_10_node));
else
return 0;
pCurr= pre_10.head;
for (i= 0; i< 9; i++)
{
if (pCurr->next == NULL)
pCurr->next= (pre_10_node *)calloc(1, sizeof(pre_10_node));
pCurr->next->pre= pCurr;
pCurr= pCurr->next;
}
pre_10.tail= pCurr;
return 0;
}
int insert_pre_10(char *word, int n, trie_node *node)
{
int i;
pre_10_node *pCurr, *pTail;
char tempword[MAX_WORD_LENGTH];
int tempnum;
build_pre_10();
pCurr= pre_10.head;
for (i= 0; i< 10; i++, pCurr= pCurr->next)
{
if (pCurr== pre_10.head && pre_cmp(node, pCurr) > 0)
{
pTail= pre_10.tail;
pre_10.tail= pTail->pre;
pTail->pre->next= NULL;
pTail->count= node->count;
pTail->prio= node->prio;
strncpy(pTail->word, word, n+1);
pTail->next= pre_10.head;
pre_10.head->pre= pTail;
pre_10.head= pTail;
pTail->pre= NULL;
return 0;
}
if (pCurr== pre_10.tail && pre_cmp(node, pCurr) > 0)
{
pre_10.tail->count= node->count;
pre_10.tail->prio= node->prio;
strncmp(pre_10.tail->word, word, n+1);
return 0;
}
if (pre_cmp(node, pCurr) > 0)
{
pTail= pre_10.tail;
pre_10.tail= pTail->pre;
pTail->pre->next= NULL;
pTail->count= node->count;
pTail->prio= node->prio;
strncpy(pTail->word, word, n+1);
pCurr->pre->next= pTail;
pTail->pre= pCurr->pre;
pTail->next= pCurr;
pCurr->pre= pTail;
return 0;
}
}
}
int show_pre_10()
{
pre_10_node *pCurr;
pCurr= pre_10.head;
printf("pre_10_list:\n");
for (int i= 0; i< 10; i++, pCurr= pCurr->next)
printf("%s, %d, %d\n", pCurr->word, pCurr->count, pCurr->prio);
printf("\n");
return 0;
}
int build_hot_10()
{
hot_10_node *pCurr;
int i;
if (hot_10.head == NULL)
hot_10.head= (hot_10_node *)calloc(1, sizeof(hot_10_node));
else
return 0;
pCurr= hot_10.head;
for (i= 0; i< 9; i++)
{
if (pCurr->next == NULL)
pCurr->next= (hot_10_node *)calloc(1, sizeof(hot_10_node));
pCurr->next->pre= pCurr;
pCurr= pCurr->next;
}
hot_10.tail= pCurr;
return 0;
}
int insert_hot_10(char *word, int n, int num)
{
int i;
hot_10_node *pCurr, *pTail;
char tempword[MAX_WORD_LENGTH];
int tempnum;
build_hot_10();
pCurr= hot_10.head;
for (i= 0; i< 10; i++, pCurr= pCurr->next)
{
if (pCurr== hot_10.head && num > pCurr->count)
{
pTail= hot_10.tail;
hot_10.tail= pTail->pre;
pTail->pre->next= NULL;
pTail->count= num;
strncpy(pTail->word, word, n+1);
pTail->next= hot_10.head;
hot_10.head->pre= pTail;
hot_10.head= pTail;
pTail->pre= NULL;
return 0;
}
if (pCurr== hot_10.tail && num > pCurr->count)
{
hot_10.tail->count= num;
strncmp(hot_10.tail->word, word, n+1);
return 0;
}
if (num > pCurr->count)
{
pTail= hot_10.tail;
hot_10.tail= pTail->pre;
pTail->pre->next= NULL;
pTail->count= num;
strncpy(pTail->word, word, n+1);
pCurr->pre->next= pTail;
pTail->pre= pCurr->pre;
pTail->next= pCurr;
pCurr->pre= pTail;
return 0;
}
}
}
int show_hot_10()
{
hot_10_node *pCurr;
pCurr= hot_10.head;
printf("hot_10_list:\n");
for (int i= 0; i< 10; i++, pCurr= pCurr->next)
printf("%s, %d\n", pCurr->word, pCurr->count);
printf("\n");
return 0;
}
int find_pre_10( const char *searchword, trie_node *troot, int n)
{
static int pos= 0;
static char word[128];
int i;
if (troot == NULL)
return 0;
if (troot->count && strncmp(searchword, word, n) == 0)
{
word[pos]='\0';
insert_pre_10(word, strlen(word), troot);
}
for (i= 0; i< MAX_TRIE_SIZE; i++)
{
word[pos++]= i;
find_pre_10(searchword, troot->next[i], n);
pos--;
}
return 0;
}
int find_hot_10(trie_node *troot)
{
static int pos= 0;
static char word[128];
int i;
if (troot == NULL)
return 0;
if (troot->count)
{
word[pos]='\0';
insert_hot_10(word, strlen(word), troot->count);
}
for (i= 0; i< MAX_TRIE_SIZE; i++)
{
word[pos++]= i;
find_hot_10(troot->next[i]);
pos--;
}
return 0;
}
int getword(char *word)
{
char *temp;
temp= strstr(mfDicCur, "\r\n");
if (temp == NULL)
return 0;
memcpy(word, mfDicCur, temp-mfDicCur);
word[temp-mfDicCur]= '\0';
mfDicCur= temp+2;
return 1;
}
int trie_test()
{
char word[MAX_WORD_LENGTH];
struct _stat sStat;
trie_node troot= {0, {NULL}};
FILE *f;
int iDicnum= 0;
_stat("vocabulary.txt", &sStat);
printf("wait read file...\n");
f= fopen("vocabulary.txt", "rb+");
mfDicCur= mfDic= (char *)calloc(1, sStat.st_size);
fread(mfDic, 1, sStat.st_size, f);
fclose(f);
printf("wait build Dictionary...\n\n");
while (1)
{
if (getword(word) == 0)
break;
iDicnum++;
insert(word, &troot);
}
//////////////////////////////////////////////////////////////////////////
char *searchword= "he";
// 1,2
printf("\\\\1,2\n");
printf("wait search \"word\": \"%s\"...\n", searchword);
printf("result:\n");
if(find_word(searchword, &troot, strlen(searchword)) == 0)
printf("no words!!\n");
printf("\n");
// 3
printf("\\\\3\n");
printf("wait search \"preword\": \"%s\"...\n", searchword);
printf("result:\n");
find_preword(searchword, &troot, strlen(searchword));
printf("\n");
// 4
printf("\\\\4\n");
printf("wait search \"pre_10_words\":...\n");
printf("result:\n");
find_pre_10(searchword, &troot, strlen(searchword));
show_pre_10();
// 5
printf("\\\\5\n");
printf("wait search \"hot_10_words\":...\n");
printf("result:\n");
find_hot_10(&troot);
show_hot_10();
//////////////////////////////////////////////////////////////////////////
free(mfDic);
return 0;
}

浙公网安备 33010602011771号