简单Trie树与三叉Trie树

 1 package sunfa.tree;
 2 
 3 import java.util.HashMap;
 4 import java.util.Iterator;
 5 import java.util.Map;
 6 
 7 /**
 8  * 参考：http://book.51cto.com/art/201106/269044.htm
 9  * Trie树，又称单词查找树或键树，是一种树形结构，是一种哈希树的变种
10  * 。典型应用是用于统计和排序大量的字符串（但不仅限于字符串），所以经常被搜索引擎系统用于文本词频统计
11  * 。它的优点是：最大限度地减少无谓的字符串比较，查询效率比哈希表高。
12  * 
13  * 网上有许多人错误的把Trie树理解为二叉的，其实Trie树可以是二叉，也可以是多叉的，
14  * 本例建立的就是多叉的Trie树，每个节点的子节点集合是一个HashMap，可以这样理解：
15  * 根节点下面有N个子节点，第K个子节点下面也是N个子节点。
16  * 
17  * Trie树的查询次数和key是有关系的，key的长度决定了树的深度。
18  * Trie是典型的以空间还时间的快速查找树，适合于对速度要求非常高的场景。
19  * 像lucene啊，搜索引擎啊，黑名单啊等就大量使用到Trie树。
20  * 最后不的不说Trie对空间的浪费是及其大的。
21  * 
22  */
23 public class TrieTreeDemo1 {
24 
25     public static void main(String[] args) {
26         Map<String, String> map = new HashMap<String, String>();
27         TrieTreeDemo1 tree = new TrieTreeDemo1();
28         for (int i = 0; i < 20; i++) {
29             map.put("key,value" + i, "value" + i);
30             tree.addWord("key,value" + i, "value" + i);
31         }
32         System.out.println("search:");
33         Iterator<String> it = map.keySet().iterator();
34         while (it.hasNext()) {
35             String key = it.next().toString();
36             System.out.println(tree.search(key));
37         }
38         System.out.println(tree.search("ke1"));
39     }
40 
41     private Entry root = new Entry();
42 
43     public void addWord(String key, String o) {
44         Entry node = root;
45         for (int i = 0; i < key.length(); i++) {
46             char c = key.charAt(i);
47             if (node.children.get(c) == null) {
48                 node.children.put(c, new Entry(c));
49             }
50             node = node.children.get(c);
51         }
52         node.o = o;
53     }
54 
55     public Object search(String key) {
56         Entry node = root;
57         int count = 0;
58         for (int i = 0; i < key.length(); i++) {
59             count++;
60             if (node.children.get(key.charAt(i)) == null)
61                 return null;
62             if (node.children.get(key.charAt(i)).o != null) {
63                 System.out.println("key:" + key + ",count:" + count);
64                 return node.children.get(key.charAt(i)).o;
65             }
66             node = node.children.get(key.charAt(i));
67         }
68         return null;
69     }
70 
71     static class Entry {
72         Map<Character, Entry> children = new HashMap<Character, TrieTreeDemo1.Entry>();
73         char c;
74         String o;
75 
76         public Entry(char c) {
77             this.c = c;
78         }
79 
80         public Entry() {
81         }
82     }
83 }

  1 package sunfa.tree;
  2 
  3 import java.util.HashMap;
  4 import java.util.Iterator;
  5 import java.util.Map;
  6 
  7 /**
  8  * 三叉Trie树 http://book.51cto.com/art/201106/269045.htm
  9  * Trie树应用http://eriol.iteye.com/blog/1166118 三叉Trie树在占用空间上要比N叉树好的多。
 10  * 在一个三叉搜索树（Ternary Search
 11  * Trie）中，每一个节点包括一个字符，但和数字搜索树不同，三叉搜索树只有三个指针：一个指向左边的树；一个指向右边的树
 12  * ；还有一个向下，指向单词的下一个数据单元
 13  * 。三叉搜索树是二叉搜索树和数字搜索树的混合体。它有和数字搜索树差不多的速度但是和二叉搜索树一样只需要相对较少的内存空间。
 14  * 树是否平衡取决于单词的读入顺序。如果按排序后的顺序插入
 15  * ，则生成方式最不平衡。单词的读入顺序对于创建平衡的三叉搜索树很重要，但对于二叉搜索树就不太重要。通过选择一个排序后数据单元集合的中间值
 16  * ，并把它作为开始节点，我们可以创建一个平衡的三叉树。可以写一个专门的过程来生成平衡的三叉树词典。
 17  * 
 18  * Patricia Tree 简称PAT tree。 它是 trie 结构的一种特殊形式。是目前信息检索领域应用十分成功的索引方
 19  * http://hxraid.iteye.com/blog/topic?show_full=true
 20  */
 21 public class TernarySearchTrie {
 22     public static void main(String[] args) {
 23         Map<String, String> map = new HashMap<String, String>();
 24         int size = 20;
 25         TernarySearchTrie tree = new TernarySearchTrie();
 26         for (int i = 0; i < size; i++) {
 27             map.put("tkey" + i, "value" + i);
 28             tree.addWord("tkey" + i);
 29         }
 30         System.out.println("search:");
 31         Iterator<String> it = map.keySet().iterator();
 32         while (it.hasNext()) {
 33             String key = it.next().toString();
 34             Entry node = tree.search(key);
 35             System.out.println(node.data.get("value") + ",查找次数："
 36                     + node.data.get("count"));
 37         }
 38     }
 39 
 40     private Entry root = new Entry();
 41 
 42     public Entry addWord(String key) {
 43         if (key == null || key.trim().length() == 0)
 44             return null;
 45         // 调试的时候发现个问题很是不明白，为什么根节点一开始就有不为NULL的right节点，并且这个right节点的splitchar是k
 46         // 终于发现了，java程序在调试的时候可能存在一个预编译的问题，某些链表式的对象调试的时候DEBUG信息不是很准备，甚至错误，比如链表啊,i++等操作调试就会看到错误的信息，这样的情况用打印语句调试算了。
 47         Entry node = root;
 48         int i = 0;
 49         while (true) {
 50             int diff = key.charAt(i) - node.splitchar;
 51             char c = key.charAt(i);
 52             if (diff == 0) {// 当前单词和上一次的相比较，如果相同
 53                 i++;
 54                 if (i == key.length()) {
 55                     node.data = new HashMap<Object, Object>();
 56                     node.data.put("value", key);
 57                     return node;
 58                 }
 59                 if (node.equals == null)
 60                     node.equals = new Entry(key.charAt(i));// 这里要注意，要获取新的单词填充进去，因为i++了
 61                 node = node.equals;
 62             } else if (diff < 0) {// 没有找到对应的字符，并且下一个左或右节点为NULL，则会一直创建新的节点
 63                 if (node.left == null)
 64                     node.left = new Entry(c);
 65                 node = node.left;
 66             } else {
 67                 if (node.right == null)
 68                     node.right = new Entry(c);
 69                 node = node.right;
 70             }
 71         }
 72     }
 73 
 74     public Entry search(String key) {
 75         if (key == null || key.trim().length() == 0)
 76             return null;
 77         Entry node = root;
 78         int count = 0, i = 0;
 79         while (true) {
 80             if (node == null)
 81                 return null;
 82             int diff = key.charAt(i) - node.splitchar;
 83             count++;
 84             if (diff == 0) {
 85                 i++;
 86                 if (i == key.length()) {
 87                     node.data.put("count", count);
 88                     return node;
 89                 }
 90                 node = node.equals;
 91             } else if (diff < 0) {
 92                 node = node.left;
 93             } else {
 94                 node = node.right;
 95             }
 96         }
 97     }
 98 
 99     /**
100      * 三叉Trie树存在3个节点，左右子节点和二叉树类似，以前key都是存放在二叉树的当前节点中，在三叉树中单词是存放在中间子树的。
101      */
102     static class Entry {
103         Entry left;
104         Entry right;
105         Entry equals;// 比对成功就放到中间节点
106         char splitchar;// 单词
107         Map<Object, Object> data;// 扩展数据域，存放 检索次数，关键码频率等信息。
108 
109         public Entry(char splitchar) {
110             this.splitchar = splitchar;
111         }
112 
113         public Entry() {
114         }
115     }
116 }

posted on 2012-12-30 22:32 王国龙阅读(538) 评论(0) 收藏举报

刷新页面返回顶部