Trie是一种字典树,用于存储文本字符,并利用了单词之间共享前缀的特点,所以叫做前缀树。不像平衡BST,Trie的高度只与最长的文本串的长度s有关系,而与单词的数量n无关。Trie的节点分两种:内部结点和叶子结点,内部结点用来存储单词key的成分字母,如果设字母表大小为d,那么每个内部结点最多有d个孩子,叶子结点存储该单词作为key的数据内容(data)。注意内部结点和叶子结点并不是互斥的,一个内部结点本身可以有儿子结点,同时它也可以是一个叶子结点。例如:
这里ab结点本身就是叶子结点,因为它以#结束符标记了,同时它有两个儿子结点a和c,c是叶子结点。
如果一颗Trie中有很多单词只有一个儿子结点,可以用Patricia Trie(Linux内核中叫做Radix Tree)压缩存储。由于#结束符标记被看作是一个叶子结点,那么一颗Patricia Trie的任何内部结点有2个或以上的孩子结点。
Patricia Trie
的基本操作包括:插入
、删除
和查询
。插入操作可能会涉及到split(拆分)
,删除操作可能会涉及到merge(合并)
。基于Patricia Trie的基本性质,split和merge操作都是局部的,这样实现起来比较简单。详细内容请参考以下代码:
package org.dreams.transaction.tree.patricia;
import java.util.LinkedList;
import java.util.List;
public class PatriciaTrie {
private class PatriciaTrieNode {
private String key;
private Integer data;
private List<PatriciaTrieNode> children = new LinkedList<>();
private boolean terminal; //use "#" for terminal char
public PatriciaTrieNode() {
this.key = "";
}
public PatriciaTrieNode(String key) {
this.key = key;
}
public String toString() {
return this.key + (this.terminal ? "#" : "") + "(" + children.size() + ")";
}
}
private PatriciaTrieNode root;
//return the value of the external node if found;
//otherwise, return null
public Integer find(String key) {
if (key == null || key.length() == 0)
return null;
if (root == null) {
return null;
} else {
return find(root, key);
}
}
private Integer find(PatriciaTrieNode currNode, String key) {
for (int i = 0; i < currNode.children.size(); i++) {
PatriciaTrieNode child = currNode.children.get(i);
//use min(child.key.length, key.length)
int len = child.key.length() < key.length() ? child.key.length() :
key.length();
int j = 0;
for (; j < len; j++) {
if (key.charAt(j) != child.key.charAt(j)) {
break;
}
}
if (j == 0) {
//this child doesn't match any character with the new key
//order keys by lexi-order
if (key.charAt(0) < child.key.charAt(0)) {
//e.g. child="e", key="c" (currNode="abc")
// abc
// / \
// e h
return null;
} else {
//e.g. child="e", key="h" (currNode="abc")
continue;
}
} else {//current child's key partially matches with the new key; 0<j<=len
if (j == len) {
if (key.length() == child.key.length()) {
if (child.terminal) {
//e.g. child="ab", key="ab"
// ab#
// /
// f#
return child.data;
} else {
//e.g. child="ab", key="ab"
// ab
// / /
// e f
return null;
}
} else if (key.length() > child.key.length()) {
//e.g. child="ab#", key="abc"
// ab#
// / /
// a c#
String subkey = key.substring(j); //c
//recursion
return find(child, subkey);
} else { //key.length()<child.key.length()
//e.g. child="abc", key="ab"
// abc
// / /
// e f
return null;
}
} else {
//0<j<len
//e.g. child="abc", key="abd"
// abc
// / /
// e f
return null;
}
}
}
return null;
}
public void delete(String key) throws Exception {
if (key == null || key.length() == 0) return;
if (root == null) {
return;
}
delete(root, key);
}
private void delete(PatriciaTrieNode currNode, String key) throws Exception {
boolean done = false;
for (int i = 0; i < currNode.children.size(); i++) {
PatriciaTrieNode child = currNode.children.get(i);
//use min(child.key.length, key.length)
int len = child.key.length() < key.length() ? child.key.length() :
key.length();
int j = 0;
for (; j < len; j++) {
if (key.charAt(j) != child.key.charAt(j)) {
break;
}
}
if (j == 0) {//this child doesn't match any character with the new key
//order keys by lexi-order
if (key.charAt(0) < child.key.charAt(0)) {
//e.g. child="e", key="c" (currNode="abc")
// abc
// / \
// e h
done = true;
throw new Exception("No such key is found for removal!");
} else {
//e.g. child="e", key="h" (currNode="abc")
continue;
}
} else {//current child's key partially matches with the new key; 0<j<=len
if (j == len) {
if (key.length() == child.key.length()) {
if (child.terminal) {
//found key, delete it
if (child.children.size() == 0) {
//e.g. child="ab#", key="ab", currNode="a"
// a
// / \
// d ab#
currNode.children.remove(i);
//merge node for currNode
if (!currNode.terminal && currNode.children.size() == 1) {
PatriciaTrieNode singleChild = currNode.children.get(0); //d
currNode.key += singleChild.key;
currNode.data = singleChild.data;
currNode.terminal = singleChild.terminal;
currNode.children = singleChild.children;
}
} else { //child.children.size()>=1
//e.g. child="ab#", key="ab", currNode="a"
// a#
// /
// ab#
// /
// f#
child.terminal = false;
//merge node for child
if (child.children.size() == 1) {
PatriciaTrieNode singleChild = child.children.get(0); //f#
child.key += singleChild.key;
child.data = singleChild.data;
child.terminal = singleChild.terminal; //Note: singleChild may not be external node
child.children = singleChild.children;
}
}
} else {
//e.g. child="ab", key="ab"
// ab
// / /
// e f
throw new Exception("No such key is found for removal!");
}
} else if (key.length() > child.key.length()) {
//e.g. child="ab#", key="abc"
// ab#
// / /
// a c#
String subkey = key.substring(j); //c
//recursion
delete(child, subkey);
} else { //key.length()<child.key.length()
//e.g. child="abc", key="ab"
// abc
// / /
// e f
throw new Exception("No such key is found for removal!");
}
} else {//0<j<len
//e.g. child="abc", key="abd"
// abc
// / /
// e f
throw new Exception("No such key is found for removal!");
}
done = true;
break;
}
}
if (!done) {
throw new Exception("No such key is found for removal!");
}
}
//value is only located at the external node
private void insert(PatriciaTrieNode currNode, String key, Integer value) throws Exception {
boolean done = false;
for (int i = 0; i < currNode.children.size(); i++) {
PatriciaTrieNode child = currNode.children.get(i);
// len: 获取key长度最小的,用于判断是否相等
int len = child.key.length() < key.length() ? child.key.length() : key.length();
// j: 相等的key字节
int j = 0;
for (; j < len; j++) {
if (key.charAt(j) != child.key.charAt(j))
break;
}
// 如果 j==0,则说明这个child没有匹配到任何的字符。
if (j == 0) {
// this child doesn't match any character with the new key
// order keys by lexi-order
// 排序
if (key.charAt(0) < child.key.charAt(0)) {
// e.g. child="e" (currNode="abc")
//
// abc abc
// / \ =========> / | \
// e f insert "c" c# e f
PatriciaTrieNode node = new PatriciaTrieNode(key);
currNode.children.add(i, node);
node.terminal = true;
node.data = value;
done = true;
break;
} else {
// key.charAt(0) > child.key.charAt(0)
// don't forget to add the largest new key after iterating all children
continue;
}
} else {//current child's key partially matches with the new key; 0<j<=len
if (j == len) {
if (key.length() == child.key.length()) {
if (child.terminal)
throw new Exception("Duplicate Key is found when insertion!");
else {
//e.g. child="ab"
//
// ab ab#
// / \ =========> / \
// e f insert "ab" e f
child.terminal = true;
child.data = value;
}
} else if (key.length() > child.key.length()) {
// e.g. child="ab#"
//
// ab# ab#
// / \ ==========> / | /
// e f insert "abc" c# e f
String subkey = key.substring(j);
// recursion
insert(child, subkey, value);
} else {
// key.length() < child.key.length()
//
// e.g. child="abc#"
// abc# ab#
// / / =========> /
// e f insert "ab" c#
// / \
// e f
String childSubkey = child.key.substring(j); //c
PatriciaTrieNode subChildNode = new PatriciaTrieNode(childSubkey);
subChildNode.terminal = child.terminal;
subChildNode.data = child.data;
subChildNode.children = child.children; //inherited from parent
child.key = key; //ab
child.terminal = true; //ab#
child.data = value;
child.children = new LinkedList<PatriciaTrieNode>();
child.children.add(subChildNode);
}
} else {
// 0 < j < len
// e.g. child="abc#"
// abc# ab
// / \ ==========> / \
// e f insert "abd" c# d#
// / \
// e f
// split at j
String childSubkey = child.key.substring(j); // c
String subkey = key.substring(j); // d
PatriciaTrieNode subChildNode = new PatriciaTrieNode(childSubkey);
subChildNode.terminal = child.terminal;
subChildNode.data = child.data;
subChildNode.children = child.children; //inherited from parent
//update child's key
child.key = child.key.substring(0, j);
//child is not terminal now due to split, it is inherited by subChildNode
child.terminal = false;
//Note: no need to merge subChildNode
PatriciaTrieNode node = new PatriciaTrieNode(subkey);
node.terminal = true;
node.data = value;
child.children = new LinkedList<PatriciaTrieNode>();
if (subkey.charAt(0) < childSubkey.charAt(0)) {
child.children.add(node);
child.children.add(subChildNode);
} else {
child.children.add(subChildNode);
child.children.add(node);
}
}
done = true;
break;
}
}
if (!done) {
PatriciaTrieNode node = new PatriciaTrieNode(key);
node.terminal = true;
node.data = value;
currNode.children.add(node);
}
}
public void insert(String key, Integer value) throws Exception {
if (key == null || key.length() == 0) return;
if (root == null) {
root = new PatriciaTrieNode();
}
insert(root, key, value);
}
public PatriciaTrieNode getRoot() {
return root;
}
//for test purpose only
public void printTree() {
this.print(0, this.root);
}
private void print(int level, PatriciaTrieNode node) {
for (int i = 0; i < level; i++) {
System.out.format(" ");
}
System.out.format("|");
for (int i = 0; i < level; i++) {
System.out.format("-");
}
if (node.terminal)
System.out.format("%s[%s]#%n", node.key, node.data);
else
System.out.format("%s%n", node.key);
for (PatriciaTrieNode child : node.children) {
print(level + 1, child);
}
}
public void testFind(String key) {
Integer val = this.find(key);
if (val != null)
System.out.format("Found key:[%s] val:[%s] %n", key, val);
else
System.out.format("Found no such key: %s %n", key);
}
public static void main(String[] args) throws Exception {
//test insertion
PatriciaTrie ptrie = new PatriciaTrie();
ptrie.insert("b", 1);
ptrie.insert("ab", 1);
ptrie.insert("abc", 2);
ptrie.insert("abde", 3);
ptrie.insert("abd", 4);
//ptrie.insert("dc");
ptrie.insert("dce", 5);
ptrie.insert("dceh", 6);
ptrie.insert("dceg", 7);
ptrie.insert("dca", 8);
ptrie.insert("dcf", 9);
ptrie.insert("ghk", 10);
ptrie.insert("gh", 11);
ptrie.insert("mns", 12);
ptrie.insert("mnt", 13);
ptrie.insert("mn", 14);
ptrie.insert("mg", 15);
ptrie.printTree();
String key = "dc";
ptrie.testFind(key);
key = "d";
ptrie.testFind(key);
key = "ab";
ptrie.testFind(key);
key = "ef";
ptrie.testFind(key);
key = "zz";
ptrie.testFind(key);
key = "dk";
ptrie.testFind(key);
key = "dcf";
ptrie.testFind(key);
key = "dck";
ptrie.testFind(key);
key = "abd";
ptrie.delete(key);
System.out.format("After delete key: %s%n", key);
ptrie.printTree();
System.out.println("****************************");
ptrie = new PatriciaTrie();
ptrie.insert("bear", 1);
ptrie.insert("bell", 2);
ptrie.insert("bid", 3);
ptrie.insert("bull", 4);
ptrie.insert("buy", 5);
ptrie.insert("sell", 6);
ptrie.insert("stock", 7);
ptrie.insert("stop", 8);
ptrie.printTree();
System.out.println("****************************");
ptrie = new PatriciaTrie();
ptrie.insert("allot", 1);
ptrie.insert("alloy", 2);
ptrie.insert("all", 3);
ptrie.insert("aloe", 4);
ptrie.insert("ant", 5);
ptrie.insert("an", 6);
ptrie.insert("are", 7);
ptrie.insert("ate", 8);
ptrie.insert("be", 9);
ptrie.printTree();
System.out.println("****************************");
ptrie = new PatriciaTrie();
ptrie.insert("minimize", 0);
ptrie.insert("mize", 4);
ptrie.insert("ze", 6);
ptrie.insert("nimize", 2);
ptrie.insert("ize", 5);
ptrie.insert("inimize", 1);
ptrie.insert("imize", 3);
ptrie.insert("e", 7);
ptrie.printTree();
key = "ize";
ptrie.testFind(key);
key = "zee";
ptrie.testFind(key);
key = "mize";
ptrie.testFind(key);
key = "mize";
ptrie.delete(key);
System.out.format("After delete key: %s%n", key);
ptrie.printTree();
}
}
测试结果:
|
|-ab[1]#
|--c[2]#
|--d[4]#
|---e[3]#
|-dc
|--a[8]#
|--e[5]#
|---g[7]#
|---h[6]#
|--f[9]#
|-gh[11]#
|--k[10]#
|-m
|--g[15]#
|--n[14]#
|---s[12]#
|---t[13]#
Found no such key: "dc"
Found no such key: "d"
Found key "ab" at: 1
Found no such key: "ef"
Found no such key: "zz"
Found no such key: "dk"
Found key "dcf" at: 9
Found no such key: "dck"
After delete key: abd
|
|-ab[1]#
|--c[2]#
|--de[3]#
|-dc
|--a[8]#
|--e[5]#
|---g[7]#
|---h[6]#
|--f[9]#
|-gh[11]#
|--k[10]#
|-m
|--g[15]#
|--n[14]#
|---s[12]#
|---t[13]#
****************************
|
|-b
|--e
|---ar[1]#
|---ll[2]#
|--id[3]#
|--u
|---ll[4]#
|---y[5]#
|-s
|--ell[6]#
|--to
|---ck[7]#
|---p[8]#
****************************
|
|-a
|--l
|---l[3]#
|----o
|-----t[1]#
|-----y[2]#
|---oe[4]#
|--n[6]#
|---t[5]#
|--re[7]#
|--te[8]#
|-be[9]#
****************************
|
|-e[7]#
|-i
|--mize[3]#
|--nimize[1]#
|--ze[5]#
|-mi
|--nimize[0]#
|--ze[4]#
|-nimize[2]#
|-ze[6]#
Found key "ize" at: 5
Found no such key: "zee"
Found key "mize" at: 4
After delete key: mize
|
|-e[7]#
|-i
|--mize[3]#
|--nimize[1]#
|--ze[5]#
|-minimize[0]#
|-nimize[2]#
|-ze[6]#
插入操作j < len这种情况的注释是不是写错了?f应该是c#的孩子吧?
现在有点忙,稍后我看看。