我决定添加这个答案,不是因为它是您问题的最佳解决方案,而是为了说明两种可能的解决方案,它们相对简单,并且与您自己正在遵循的方法有些类似。
下面的示例(非优化)提供了一个极其简单的前缀树实现,它使用每个已消耗字符的节点。
public class SimplePrefixTrie
{
private readonly Node _root = new Node();
private class Node
{
public Dictionary<char, Node> Children;
public bool IsTerminal;
public Node Find(string word, int index)
{
var child = default(Node);
if (index < word.Length && Children != null)
Children.TryGetValue(word[index], out child);
return child;
}
public Node Add(string word, int toConsume)
{
var child = default(Node);
if (toConsume == word.Length)
this.IsTerminal = true;
else if (Children == null || !Children.TryGetValue(word[toConsume], out child))
{
if (Children == null)
Children = new Dictionary<char, Node>();
Children[word[toConsume]] = child = new Node();
}
return child;
}
}
public void AddWord(string word)
{
var ndx = 0;
var cur = _root;
while (cur != null)
cur = cur.Add(word, ndx++);
}
public IEnumerable<string> FindWordsMatchingPrefixesOf(string searchWord)
{
var ndx = 0;
var cur = _root;
while (cur != null)
{
if (cur.IsTerminal)
yield return searchWord.Substring(0, ndx);
cur = cur.Find(searchWord, ndx++);
}
}
}
下面还添加了一个简单的压缩前缀树实现。它与上面的示例几乎相同,但存储共享前缀部分而不是单个字符。当现有存储的前缀变为共享并需要拆分成两个部分时,它会分裂节点。
public class SimpleCompressedPrefixTrie
{
private readonly Node _root = new Node();
private class Node
{
private Dictionary<char, Node> _children;
public string PrefixValue = string.Empty;
public bool IsTerminal;
public Node Add(string word, ref int startIndex)
{
var n = FindSharedPrefix(word, startIndex);
startIndex += n;
if (n == PrefixValue.Length)
{
if (startIndex == word.Length)
IsTerminal = true;
else
return AddToChild(word, ref startIndex);
}
else
SplittingAdd(word, n, ref startIndex);
return null;
}
public Node Find(string word, ref int startIndex, out int matchLen)
{
var n = FindSharedPrefix(word, startIndex);
startIndex += n;
matchLen = -1;
if (n == PrefixValue.Length)
{
if (IsTerminal)
matchLen = startIndex;
var child = default(Node);
if (_children != null && startIndex < word.Length && _children.TryGetValue(word[startIndex], out child))
{
startIndex++;
return child;
}
}
return null;
}
private Node AddToChild(string word, ref int startIndex)
{
var key = word[startIndex++];
var nextNode = default(Node);
if (_children == null)
_children = new Dictionary<char, Node>();
else if (_children.TryGetValue(key, out nextNode))
return nextNode;
var remainder = word.Substring(startIndex);
_children[key] = new Node() { PrefixValue = remainder, IsTerminal = true };
return null;
}
private void SplittingAdd(string word, int n, ref int startIndex)
{
var curChildren = _children;
_children = new Dictionary<char, Node>();
_children[PrefixValue[n]] = new Node()
{
PrefixValue = this.PrefixValue.Substring(n + 1),
IsTerminal = this.IsTerminal,
_children = curChildren
};
PrefixValue = PrefixValue.Substring(0, n);
IsTerminal = startIndex == word.Length;
if (!IsTerminal)
{
var prefix = word.Length > startIndex + 1 ? word.Substring(startIndex + 1) : string.Empty;
_children[word[startIndex]] = new Node() { PrefixValue = prefix, IsTerminal = true };
startIndex++;
}
}
private int FindSharedPrefix(string word, int startIndex)
{
var n = Math.Min(PrefixValue.Length, word.Length - startIndex);
var len = 0;
while (len < n && PrefixValue[len] == word[len + startIndex])
len++;
return len;
}
}
public void AddWord(string word)
{
var ndx = 0;
var cur = _root;
while (cur != null)
cur = cur.Add(word, ref ndx);
}
public IEnumerable<string> FindWordsMatchingPrefixesOf(string searchWord)
{
var startNdx = 0;
var cur = _root;
while (cur != null)
{
var matchLen = 0;
cur = cur.Find(searchWord, ref startNdx, out matchLen);
if (matchLen > 0)
yield return searchWord.Substring(0, matchLen);
};
}
}
使用示例:
var trie = new SimplePrefixTrie();
trie.AddWord("hello");
trie.AddWord("iced");
trie.AddWord("i");
trie.AddWord("ice");
trie.AddWord("icecone");
trie.AddWord("dtgg");
trie.AddWord("hicet");
foreach (var w in trie.FindWordsMatchingPrefixesOf("icedtgg"))
Console.WriteLine(w);
带有输出:
i
ice
iced
更新:选择正确的数据结构很重要
我认为更新一下可以提供一些价值,以说明选择适合问题的数据结构是多么重要,以及涉及哪些折衷。因此,我创建了一个小型基准应用程序,测试到目前为止在这个问题中提供的策略与基线参考实现之间的差异。
- Naive:是最简单、最朴素的解决方案。
- JimMischel:基于这个答案的方法。
- MattyMerrix:基于你自己的答案在这里。
- JimMattyDSL:结合了“JimMischel”和“MattyMerrix”的方法,并在排序列表中使用更优化的二进制字符串搜索。
- SimpleTrie和CompessedTrie基于这个答案中描述的两种实现。
完整的基准代码可以在
this gist中找到。使用包含10,000、100,000和1,000,000(随机生成的字符序列)单词的字典运行该代码,并搜索所有前缀匹配项的结果如下:
将5000个单词与25个字符以内的10000个术语的字典进行匹配。
Method Memory (MB) Build Time (s) Lookup Time (s)
Naive 0.64-0.64, 0.64 0.001-0.002, 0.001 6.136-6.312, 6.210
JimMischel 0.84-0.84, 0.84 0.013-0.018, 0.016 0.083-0.113, 0.102
JimMattyDSL 0.80-0.81, 0.80 0.013-0.018, 0.016 0.008-0.011, 0.010
SimpleTrie 24.55-24.56, 24.56 0.042-0.056, 0.051 0.002-0.002, 0.002
CompessedTrie 1.84-1.84, 1.84 0.003-0.003, 0.003 0.002-0.002, 0.002
MattyMerrix 0.83-0.83, 0.83 0.017-0.017, 0.017 0.034-0.034, 0.034
将5000个单词与最大长度为25的100000个术语的字典进行匹配
Method Memory (MB) Build Time (s) Lookup Time (s)
Naive 6.01-6.01, 6.01 0.024-0.026, 0.025 65.651-65.758, 65.715
JimMischel 6.32-6.32, 6.32 0.232-0.236, 0.233 1.208-1.254, 1.235
JimMattyDSL 5.95-5.96, 5.96 0.264-0.269, 0.266 0.050-0.052, 0.051
SimpleTrie 226.49-226.49, 226.49 0.932-0.962, 0.951 0.004-0.004, 0.004
CompessedTrie 16.10-16.10, 16.10 0.101-0.126, 0.111 0.003-0.003, 0.003
MattyMerrix 6.15-6.15, 6.15 0.254-0.269, 0.259 0.414-0.418, 0.416
将5000个单词与最长长度为25的100万个字典条目匹配
Method Memory (MB) Build Time (s) Lookup Time (s)
JimMischel 57.69-57.69, 57.69 3.027-3.086, 3.052 16.341-16.415, 16.373
JimMattyDSL 60.88-60.88, 60.88 3.396-3.484, 3.453 0.399-0.400, 0.399
SimpleTrie 2124.57-2124.57, 2124.57 11.622-11.989, 11.860 0.006-0.006, 0.006
CompessedTrie 166.59-166.59, 166.59 2.813-2.832, 2.823 0.005-0.005, 0.005
MattyMerrix 62.71-62.73, 62.72 3.230-3.270, 3.251 6.996-7.015, 7.008
如您所见,(非空间优化的)Trie 字典树所需的内存要高得多。对于所有测试实现,它都会随着字典大小增加而增加,为 O(N)。
正如预期的那样,Trie 字典树的查找时间基本上是恒定的:O(k),仅取决于搜索词的长度。对于其他实现,时间将根据要搜索的字典大小而增加。
请注意,可以构建更优化的实现来解决此问题,这些实现将接近于 O(k) 的搜索时间,并允许更紧凑的存储和减少的内存占用。如果映射到缩小的字母表(例如仅限于 'A'-'Z'),也可以利用这一点。
{key: {word_list}
的格式,并且每个键的值都是一个单词集合,那么每次查找时你将会拥有O(1)的查找时间,前提是没有太多的碰撞。 - kylieCatt