trie练习

 帮msdn朋友写的.  

 

应用背景

随着互联网信息急剧地增加,要在互联网中检索到自己想要的信息变得非常困难。全文搜索引擎的出现,使我们能够在庞大的互联网中检索到自己需要的信息。Google,Baidu是目前文本搜索领域最具代表性的两个高效搜索引擎。如果你在baidu或者google的搜索框中输入ja,就会出现一个候选框,如下所示:

 

候选框中都是以ja为前缀的word,本次课程设计我们就来探索如何解决这样的问题。

此外,当你输入java然后点search的时候,被检索到的都是包含java的网页,如下图所示。在这个检索结果中,我们可以把每个网页看成一个Document,这个问题就可以描述为如何快速地在所有的Document中检索到包含java的全部Document。

 

下载

包括: 代码, 测试数据, 程序背景说明

https://files.cnblogs.com/LeeCe/trie.rar

功能

1. 建立trie树字典, 测试数据约120W词(有重复词).

2. 查找trie树某个词, 如果存在, 输出词典中重复次数.

3. 查找以给定前缀开头的所有词.

4. 查找前缀词中数量最多的前10个

5. 查找字典中出现数量最多的前10个

 

使用方法

  新建“空win32项目”, 引入两个tree文件(exp_tree.h 和 exp_tree.cpp), 将“vocabulary.txt”字典文件放到项目根目录下, 调用trie_test()即可.

 

参考文献

  trie算法: http://zh.wikipedia.org/zh/Trie

 

代码

//main.cpp
#include "exp_tree.h"

// Program entry point: runs the trie demo.
// Returns 0 when trie_test() reports success (0) and 1 otherwise, so the
// process exit status follows the usual "0 means OK" convention.
int main()
{
    return trie_test() == 0 ? 0 : 1;
}


 

代码
//exp_tree.h
#if !defined __TREE__H__20101229__
#define __TREE__H__20101229__
//////////////////////////////////////////////////////////////////////////
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <windows.h>
#include "sys/stat.h"

//////////////////////////////////////////////////////////////////////////
// Number of child slots per trie node (one slot per possible byte value).
#define MAX_TRIE_SIZE 256
// Size in bytes of the fixed word buffers used throughout this module.
#define MAX_WORD_LENGTH 128

// One node of the trie. The bytes on the path from the root to a node spell
// the word that node stands for.
struct trie_node_st {
    int count;                                  // times insert() ended a word on this node
    int prio;                                   // value of the running counter at the latest such insert()
    struct trie_node_st *next[MAX_TRIE_SIZE];   // children, indexed by the next byte of the word
};

// One slot of the doubly linked "hot 10" list (most frequent words overall).
// The self-references spell out `struct` so the header is valid C as well
// as C++.
struct hot_10_node_st {
    int count;                      // occurrence count stored for this slot
    char word[MAX_WORD_LENGTH];     // NUL-terminated word stored for this slot
    struct hot_10_node_st *next;    // following slot, NULL at the tail
    struct hot_10_node_st *pre;     // preceding slot, NULL at the head
};

// Head/tail handle of the "hot 10" list.
// Uses the `struct` keyword on the node type so the header is valid C as
// well as C++.
struct hot_10_st {
    struct hot_10_node_st *head;    // first slot, NULL until the list is built
    struct hot_10_node_st *tail;    // last slot, NULL until the list is built
};

// One slot of the doubly linked "prefix 10" list (best words for a prefix).
// The self-references spell out `struct` so the header is valid C as well
// as C++.
struct pre_10_node_st {
    int count;                      // occurrence count stored for this slot
    int prio;                       // tie-breaker copied from the trie node
    char word[MAX_WORD_LENGTH];     // NUL-terminated word stored for this slot
    struct pre_10_node_st *next;    // following slot, NULL at the tail
    struct pre_10_node_st *pre;     // preceding slot, NULL at the head
};

// Head/tail handle of the "prefix 10" list.
// Uses the `struct` keyword on the node type so the header is valid C as
// well as C++.
struct pre_10_st {
    struct pre_10_node_st *head;    // first slot, NULL until the list is built
    struct pre_10_node_st *tail;    // last slot, NULL until the list is built
};

// Short aliases for the struct tags declared in this header.
typedef struct trie_node_st   trie_node;
typedef struct hot_10_node_st hot_10_node;
typedef struct hot_10_st      hot_10_list;
typedef struct pre_10_node_st pre_10_node;
typedef struct pre_10_st      pre_10_list;
//////////////////////////////////////////////////////////////////////////
// 1. Build the dictionary trie / dump every stored word.
int insert(const char *word, trie_node *troot);
int travel( trie_node *troot );
// 2. Look up one exact word and print its repeat count.
int find_word( const char *searchword, trie_node *troot, int size);
// 3. Print every word that starts with the first n bytes of searchword.
int find_preword( const char *searchword, trie_node *troot, int n);
// 4. Collect the 10 best words for a prefix.
int find_pre_10( const char *searchword, trie_node *troot, int n);
int pre_cmp(trie_node *pre1, pre_10_node *pre2);
// Declared under the name the implementation actually defines
// (build_pre_10, not built_pre_10).
int build_pre_10(void);
int insert_pre_10(char *word, int n, trie_node *node);
int show_pre_10(void);
// 5. Collect the 10 most frequent words of the whole dictionary.
int find_hot_10(trie_node *troot);
// Declared under the name the implementation actually defines
// (build_hot_10, not built_hot_10).
int build_hot_10(void);
int insert_hot_10(char *word, int n, int num);
int show_hot_10(void);

// Dictionary reader and demo driver.
int getword(char *word);
int trie_test(void);
//////////////////////////////////////////////////////////////////////////
#endif


代码
//exp_tree.cpp
#include "exp_tree.h"
//////////////////////////////////////////////////////////////////////////
//static tire_root troot;
//////////////////////////////////////////////////////////////////////////
// In-memory copy of the dictionary file: mfDic is the start of the buffer
// (the pointer that gets freed), mfDicCur is the read cursor that getword()
// advances.
static char *mfDic = NULL;
static char *mfDicCur = NULL;
// Top-10 lists; both stay empty until their build_*_10() function runs.
static hot_10_list hot_10 = { NULL, NULL };
static pre_10_list pre_10 = { NULL, NULL };
// Running counter handed out by insert() as trie_node.prio.
// NOTE(review): the name matches the C library function time(); confirm no
// included header declares it in this translation unit.
static int time = 0;
int insert(const char *word, trie_node *troot)
{
int i= 0;
trie_node
*curr, *newnode;

if ('\0' == word[0])
return 0;
curr
= troot;
for(i= 0;;i++)
{
if(curr->next[word[i]] == NULL)
{
newnode
= (trie_node *)calloc(1, sizeof(trie_node));
curr
->next[word[i]]= newnode;
}

if (word[i] == '\0')
break;

curr
= curr->next[word[i]];
}
curr
->prio= time++;
curr
->count++;
return 0;
}

int travel( trie_node *troot )
{
static int sum= 0, pos= 0;
static char word[128];
int i;

if (troot == NULL)
return 0;

if (troot->count)
{
word[pos]
='\0';
printf(
"%s\n", word);
sum
++;
}

for (i= 0; i< MAX_TRIE_SIZE; i++)
{
word[pos
++]= i;
travel(troot
->next[i]);
pos
--;
}

return 0;
}

int find_word( const char *searchword, trie_node *troot, int size)
{
static int pos= 0;
static char word[128];
int i;

if (troot == NULL)
return 0;

if (troot->count)
{
word[pos]
='\0';
if( strlen(word) == size && strncmp(searchword, word, strlen(word)) == 0)
{
printf(
"%s, %d\n", word, troot->count);
return -1;
}
}

for (i= 0; i< MAX_TRIE_SIZE; i++)
{
word[pos
++]= i;
if(find_word(searchword, troot->next[i], size) < 0) return -1;
pos
--;
}

return 0;
}

int find_preword( const char *searchword, trie_node *troot, int n)
{
static int pos= 0;
static char word[128];
int i;

if (troot == NULL)
return 0;

if (troot->count && strncmp(searchword, word, n) == 0)
{
word[pos]
='\0';
printf(
"%s, %d\n", word, troot->count);
}

for (i= 0; i< MAX_TRIE_SIZE; i++)
{
word[pos
++]= i;
find_preword(searchword, troot
->next[i], n);
pos
--;
}

return 0;
}

// Orders a trie node against a prefix-list slot: first by count, then by
// prio. Returns 1 when pre1 ranks higher, -1 when it ranks lower and 0 when
// both fields are equal.
int pre_cmp(trie_node *pre1, pre_10_node *pre2)
{
    if (pre1->count != pre2->count)
        return (pre1->count > pre2->count) ? 1 : -1;

    if (pre1->prio != pre2->prio)
        return (pre1->prio > pre2->prio) ? 1 : -1;

    return 0;
}

int build_pre_10()
{
pre_10_node
*pCurr;
int i;

if (pre_10.head == NULL)
pre_10.head
= (pre_10_node *)calloc(1, sizeof(pre_10_node));
else
return 0;

pCurr
= pre_10.head;
for (i= 0; i< 9; i++)
{
if (pCurr->next == NULL)
pCurr
->next= (pre_10_node *)calloc(1, sizeof(pre_10_node));
pCurr
->next->pre= pCurr;
pCurr
= pCurr->next;
}
pre_10.tail
= pCurr;

return 0;
}

int insert_pre_10(char *word, int n, trie_node *node)
{
int i;
pre_10_node
*pCurr, *pTail;
char tempword[MAX_WORD_LENGTH];
int tempnum;

build_pre_10();

pCurr
= pre_10.head;
for (i= 0; i< 10; i++, pCurr= pCurr->next)
{
if (pCurr== pre_10.head && pre_cmp(node, pCurr) > 0)
{
pTail
= pre_10.tail;
pre_10.tail
= pTail->pre;
pTail
->pre->next= NULL;

pTail
->count= node->count;
pTail
->prio= node->prio;
strncpy(pTail
->word, word, n+1);

pTail
->next= pre_10.head;
pre_10.head
->pre= pTail;
pre_10.head
= pTail;
pTail
->pre= NULL;
return 0;
}
if (pCurr== pre_10.tail && pre_cmp(node, pCurr) > 0)
{
pre_10.tail
->count= node->count;
pre_10.tail
->prio= node->prio;
strncmp(pre_10.tail
->word, word, n+1);
return 0;
}
if (pre_cmp(node, pCurr) > 0)
{
pTail
= pre_10.tail;
pre_10.tail
= pTail->pre;
pTail
->pre->next= NULL;

pTail
->count= node->count;
pTail
->prio= node->prio;
strncpy(pTail
->word, word, n+1);

pCurr
->pre->next= pTail;
pTail
->pre= pCurr->pre;
pTail
->next= pCurr;
pCurr
->pre= pTail;
return 0;
}
}

}

int show_pre_10()
{
pre_10_node
*pCurr;

pCurr
= pre_10.head;
printf(
"pre_10_list:\n");
for (int i= 0; i< 10; i++, pCurr= pCurr->next)
printf(
"%s, %d, %d\n", pCurr->word, pCurr->count, pCurr->prio);
printf(
"\n");
return 0;
}

int build_hot_10()
{
hot_10_node
*pCurr;
int i;

if (hot_10.head == NULL)
hot_10.head
= (hot_10_node *)calloc(1, sizeof(hot_10_node));
else
return 0;

pCurr
= hot_10.head;
for (i= 0; i< 9; i++)
{
if (pCurr->next == NULL)
pCurr
->next= (hot_10_node *)calloc(1, sizeof(hot_10_node));
pCurr
->next->pre= pCurr;
pCurr
= pCurr->next;
}
hot_10.tail
= pCurr;

return 0;
}

int insert_hot_10(char *word, int n, int num)
{
int i;
hot_10_node
*pCurr, *pTail;
char tempword[MAX_WORD_LENGTH];
int tempnum;

build_hot_10();

pCurr
= hot_10.head;
for (i= 0; i< 10; i++, pCurr= pCurr->next)
{
if (pCurr== hot_10.head && num > pCurr->count)
{
pTail
= hot_10.tail;
hot_10.tail
= pTail->pre;
pTail
->pre->next= NULL;

pTail
->count= num;
strncpy(pTail
->word, word, n+1);

pTail
->next= hot_10.head;
hot_10.head
->pre= pTail;
hot_10.head
= pTail;
pTail
->pre= NULL;
return 0;
}
if (pCurr== hot_10.tail && num > pCurr->count)
{
hot_10.tail
->count= num;
strncmp(hot_10.tail
->word, word, n+1);
return 0;
}
if (num > pCurr->count)
{
pTail
= hot_10.tail;
hot_10.tail
= pTail->pre;
pTail
->pre->next= NULL;

pTail
->count= num;
strncpy(pTail
->word, word, n+1);

pCurr
->pre->next= pTail;
pTail
->pre= pCurr->pre;
pTail
->next= pCurr;
pCurr
->pre= pTail;
return 0;
}
}
}

int show_hot_10()
{
hot_10_node
*pCurr;

pCurr
= hot_10.head;
printf(
"hot_10_list:\n");
for (int i= 0; i< 10; i++, pCurr= pCurr->next)
printf(
"%s, %d\n", pCurr->word, pCurr->count);
printf(
"\n");
return 0;
}

int find_pre_10( const char *searchword, trie_node *troot, int n)
{
static int pos= 0;
static char word[128];
int i;

if (troot == NULL)
return 0;

if (troot->count && strncmp(searchword, word, n) == 0)
{
word[pos]
='\0';
insert_pre_10(word, strlen(word), troot);
}

for (i= 0; i< MAX_TRIE_SIZE; i++)
{
word[pos
++]= i;
find_pre_10(searchword, troot
->next[i], n);
pos
--;
}

return 0;
}

int find_hot_10(trie_node *troot)
{
static int pos= 0;
static char word[128];
int i;

if (troot == NULL)
return 0;

if (troot->count)
{
word[pos]
='\0';
insert_hot_10(word, strlen(word), troot
->count);
}

for (i= 0; i< MAX_TRIE_SIZE; i++)
{
word[pos
++]= i;
find_hot_10(troot
->next[i]);
pos
--;
}

return 0;
}

int getword(char *word)
{
char *temp;

temp
= strstr(mfDicCur, "\r\n");
if (temp == NULL)
return 0;

memcpy(word, mfDicCur, temp
-mfDicCur);
word[temp
-mfDicCur]= '\0';

mfDicCur
= temp+2;
return 1;
}

int trie_test()
{
char word[MAX_WORD_LENGTH];
struct _stat sStat;
trie_node troot
= {0, {NULL}};
FILE
*f;
int iDicnum= 0;

_stat(
"vocabulary.txt", &sStat);

printf(
"wait read file...\n");
f
= fopen("vocabulary.txt", "rb+");
mfDicCur
= mfDic= (char *)calloc(1, sStat.st_size);
fread(mfDic,
1, sStat.st_size, f);
fclose(f);

printf(
"wait build Dictionary...\n\n");
while (1)
{
if (getword(word) == 0)
break;
iDicnum
++;
insert(word,
&troot);
}
//////////////////////////////////////////////////////////////////////////
char *searchword= "he";
// 1,2
printf("\\\\1,2\n");
printf(
"wait search \"word\": \"%s\"...\n", searchword);
printf(
"result:\n");
if(find_word(searchword, &troot, strlen(searchword)) == 0)
printf(
"no words!!\n");
printf(
"\n");
// 3
printf("\\\\3\n");
printf(
"wait search \"preword\": \"%s\"...\n", searchword);
printf(
"result:\n");
find_preword(searchword,
&troot, strlen(searchword));
printf(
"\n");
// 4
printf("\\\\4\n");
printf(
"wait search \"pre_10_words\":...\n");
printf(
"result:\n");
find_pre_10(searchword,
&troot, strlen(searchword));
show_pre_10();
// 5
printf("\\\\5\n");
printf(
"wait search \"hot_10_words\":...\n");
printf(
"result:\n");
find_hot_10(
&troot);
show_hot_10();
//////////////////////////////////////////////////////////////////////////

free(mfDic);
return 0;
}



 

posted on 2010-12-30 00:03  oleeceo  阅读(301)  评论(1)    收藏  举报

导航