说我有一个100,000字的列表。我想知道给定的字符串是否与该列表中的任何单词匹配,我想以最快的方式进行。此外,我想知道在该字符串中是否有任何其他由第一个字符开头形成的单词出现在列表中。
例如:
假设你有字符串 "icedtgg"
它的前缀依次是 "i" "ic" "ice" "iced" "icedt" "icedtg" "icedtgg"
我正在尝试提出一种最佳比较算法,告诉我列表中是否包含上述这些单词。
到目前为止,我的100,000个单词列表存储在如下结构中:
Dictionary<char, List<string>> WordList;
其中char
是单词的第一个字符,List<string>
是以该字符开头的所有单词。
所以WordList['a']
列出了以 'a' 开头的所有单词("ape"、"apple"、"art" 等等),而 'b' 对应一个以 b 开头的所有单词的列表,依此类推。
因为我知道我要找的所有单词都以 'i' 开头,所以我可以先将搜索范围从100,000个单词缩小到仅以 'i' 开头的单词。
List<string> CurrentWordList = WordList['i'];
现在我检查
if( CurrentWordList[0].Length == 1 )
然后我知道我的第一个字符串 "i" 是匹配的,因为 "i" 将是列表中的第一个单词。这些列表是按字母顺序排序的,以免减慢匹配。
有什么想法吗?
*不,这不是家庭作业,我是一名专业的软件架构师,出于兴趣/爱好/游戏开发的目的,想找到最佳的匹配算法。
答案 0 :(得分:2)
我决定添加这个答案并不是因为它是您问题的最佳解决方案,而是说明两个可能的解决方案,这些解决方案相对简单并且与您似乎遵循自己的方法有些一致。
下面的(非优化的)示例提供了一个极其简单前缀trie 实现,它使用每个消费字符的节点。
/// <summary>
/// A minimal (non-optimized) prefix trie that spends one node per consumed character.
/// Words are inserted with <see cref="AddWord"/>; <see cref="FindWordsMatchingPrefixesOf"/>
/// reports every stored word that is a prefix of a given search string.
/// </summary>
public class SimplePrefixTrie
{
    // The root node stands for the empty string.
    private readonly Node _root = new Node();

    /// <summary>One trie node: outgoing edges keyed by character plus an end-of-word flag.</summary>
    private class Node
    {
        // Created lazily on the first insertion below this node; null while the node is a leaf.
        public Dictionary<char, Node> Children;

        // True when a complete word ends exactly at this node.
        public bool IsTerminal;

        /// <summary>
        /// Returns the child reached by following <c>word[index]</c>, or null when the
        /// word is exhausted or no such child exists.
        /// </summary>
        public Node Find(string word, int index)
        {
            // The word is inspected before Children so a null word fails the same way it always did.
            if (index >= word.Length || Children == null)
            {
                return null;
            }

            Node next;
            return Children.TryGetValue(word[index], out next) ? next : null;
        }

        /// <summary>
        /// Performs one insertion step. When every character of <paramref name="word"/> has
        /// been consumed, this node is flagged terminal and null is returned; otherwise the
        /// (possibly newly created) child for <c>word[toConsume]</c> is returned.
        /// </summary>
        public Node Add(string word, int toConsume)
        {
            if (toConsume == word.Length)
            {
                IsTerminal = true;
                return null;
            }

            char key = word[toConsume];
            if (Children == null)
            {
                Children = new Dictionary<char, Node>();
            }

            Node next;
            if (!Children.TryGetValue(key, out next))
            {
                next = new Node();
                Children[key] = next;
            }
            return next;
        }
    }

    /// <summary>Inserts <paramref name="word"/> into the trie, one character per level.</summary>
    public void AddWord(string word)
    {
        Node node = _root;
        // Node.Add returns null once the final character has been consumed.
        for (int consumed = 0; node != null; consumed++)
        {
            node = node.Add(word, consumed);
        }
    }

    /// <summary>
    /// Lazily yields, shortest first, every stored word that is a prefix of
    /// <paramref name="searchWord"/>.
    /// </summary>
    public IEnumerable<string> FindWordsMatchingPrefixesOf(string searchWord)
    {
        Node node = _root;
        // 'length' is the number of characters of searchWord consumed to reach 'node'.
        for (int length = 0; node != null; length++)
        {
            if (node.IsTerminal)
            {
                yield return searchWord.Substring(0, length);
            }
            node = node.Find(searchWord, length);
        }
    }
}
下面还添加了压缩前缀 trie 的简单实现。它遵循与上面示例几乎相同的方法,但每个节点存储的是共享的前缀片段,而不是单个字符。当已存储的前缀只有一部分被新单词共享时,它会把该节点拆分成两部分。
/// <summary>
/// A compressed prefix trie: each node stores a string fragment (PrefixValue) rather than a
/// single character, and a node is split when only part of its fragment is shared with a
/// newly added word. Child nodes are keyed by the character that follows the parent's
/// fragment; that key character is not repeated in the child's PrefixValue.
/// </summary>
public class SimpleCompressedPrefixTrie
{
// The root starts with an empty PrefixValue and IsTerminal == false.
private readonly Node _root = new Node();
private class Node
{
// Outgoing edges, keyed by the character consumed when moving to the child. Created lazily.
private Dictionary<char, Node> _children;
// Fragment of the word stored at this node (excludes the key character that led here).
public string PrefixValue = string.Empty;
// True when a complete word ends after this node's fragment.
public bool IsTerminal;
/// <summary>
/// Performs one insertion step for <paramref name="word"/> starting at
/// <paramref name="startIndex"/>, which is advanced past the characters consumed.
/// Returns the child node in which insertion must continue, or null when the word
/// has been fully stored.
/// </summary>
public Node Add(string word, ref int startIndex)
{
// n = number of leading characters of PrefixValue that match word at startIndex.
var n = FindSharedPrefix(word, startIndex);
startIndex += n;
if (n == PrefixValue.Length) // full prefix match
{
if (startIndex == word.Length) // full match
IsTerminal = true;
else
return AddToChild(word, ref startIndex);
}
else // partial match, need to split this node's prefix.
SplittingAdd(word, n, ref startIndex);
return null;
}
/// <summary>
/// Performs one lookup step. Advances <paramref name="startIndex"/> past the matched
/// part of this node's fragment. <paramref name="matchLen"/> is set to the number of
/// characters of <paramref name="word"/> consumed so far when the whole fragment matched
/// and this node is terminal, and to -1 otherwise. Returns the child to continue in, or
/// null when the search cannot go further.
/// </summary>
public Node Find(string word, ref int startIndex, out int matchLen)
{
var n = FindSharedPrefix(word, startIndex);
startIndex += n;
matchLen = -1;
if (n == PrefixValue.Length)
{
if (IsTerminal)
matchLen = startIndex;
var child = default(Node);
if (_children != null && startIndex < word.Length && _children.TryGetValue(word[startIndex], out child))
{
startIndex++; // consumed map key character.
return child;
}
}
return null;
}
// Consumes the key character at startIndex. If a child already exists under that key it is
// returned so the caller continues there; otherwise a new terminal child holding the rest
// of the word is created and null is returned.
private Node AddToChild(string word, ref int startIndex)
{
var key = word[startIndex++]; // consume the mapping character
var nextNode = default(Node);
if (_children == null)
_children = new Dictionary<char, Node>();
else if (_children.TryGetValue(key, out nextNode))
return nextNode;
var remainder = word.Substring(startIndex);
_children[key] = new Node() { PrefixValue = remainder, IsTerminal = true };
return null; // consumed.
}
// Splits this node after the first n characters of PrefixValue. The old tail (minus its
// first character, which becomes the map key) moves into a new child that inherits this
// node's terminal flag and children. If the word has characters left, they go into a
// second new terminal child; otherwise this node itself becomes terminal.
private void SplittingAdd(string word, int n, ref int startIndex)
{
var curChildren = _children;
_children = new Dictionary<char, Node>();
_children[PrefixValue[n]] = new Node()
{
PrefixValue = this.PrefixValue.Substring(n + 1),
IsTerminal = this.IsTerminal,
_children = curChildren
};
PrefixValue = PrefixValue.Substring(0, n);
// startIndex was already advanced by n in Add, so equality means the word ends at the split point.
IsTerminal = startIndex == word.Length;
if (!IsTerminal)
{
// word[startIndex] becomes the key; everything after it is the new child's fragment.
var prefix = word.Length > startIndex + 1 ? word.Substring(startIndex + 1) : string.Empty;
_children[word[startIndex]] = new Node() { PrefixValue = prefix, IsTerminal = true };
startIndex++;
}
}
// Returns how many leading characters of PrefixValue equal the characters of word
// beginning at startIndex (bounded by whichever runs out first).
private int FindSharedPrefix(string word, int startIndex)
{
var n = Math.Min(PrefixValue.Length, word.Length - startIndex);
var len = 0;
while (len < n && PrefixValue[len] == word[len + startIndex])
len++;
return len;
}
}
/// <summary>Inserts <paramref name="word"/>, descending until a node reports completion (null).</summary>
public void AddWord(string word)
{
var ndx = 0;
var cur = _root;
while (cur != null)
cur = cur.Add(word, ref ndx);
}
/// <summary>
/// Lazily yields every stored word that is a prefix of <paramref name="searchWord"/>,
/// shortest first. Only matches with a length greater than zero are yielded, so an
/// empty-string word stored at the root (match length 0) is not reported.
/// </summary>
public IEnumerable<string> FindWordsMatchingPrefixesOf(string searchWord)
{
var startNdx = 0;
var cur = _root;
while (cur != null)
{
var matchLen = 0;
cur = cur.Find(searchWord, ref startNdx, out matchLen);
if (matchLen > 0)
yield return searchWord.Substring(0, matchLen);
};
}
}
用法示例:
// Build a trie from a handful of sample words, then print every stored word
// that is a prefix of "icedtgg".
var trie = new SimplePrefixTrie(); // or new SimpleCompressedPrefixTrie();
var sampleWords = new[] { "hello", "iced", "i", "ice", "icecone", "dtgg", "hicet" };
foreach (var sample in sampleWords)
{
    trie.AddWord(sample);
}
foreach (var match in trie.FindWordsMatchingPrefixesOf("icedtgg"))
{
    Console.WriteLine(match);
}
输出:
i
ice
iced
更新:选择正确的数据结构非常重要
我认为补充一个更新是有价值的,它可以说明选择适合问题的数据结构有多重要,以及涉及哪些权衡取舍。因此,我创建了一个小型基准测试应用程序,用于测试目前为止该问题各个答案中提供的策略,并与一个基准参考实现进行对比。
完整的基准代码可以在 this gist 中找到。分别使用包含 10,000、100,000 和 1,000,000 个(随机生成的字符序列)单词的字典运行它,并搜索 5,000 个术语的所有前缀匹配,结果如下:
将5000个单词与10000个最大长度为25的词典进行匹配
Method Memory (MB) Build Time (s) Lookup Time (s)
Naive 0.64-0.64, 0.64 0.001-0.002, 0.001 6.136-6.312, 6.210
JimMischel 0.84-0.84, 0.84 0.013-0.018, 0.016 0.083-0.113, 0.102
JimMattyDSL 0.80-0.81, 0.80 0.013-0.018, 0.016 0.008-0.011, 0.010
SimpleTrie 24.55-24.56, 24.56 0.042-0.056, 0.051 0.002-0.002, 0.002
CompessedTrie 1.84-1.84, 1.84 0.003-0.003, 0.003 0.002-0.002, 0.002
MattyMerrix 0.83-0.83, 0.83 0.017-0.017, 0.017 0.034-0.034, 0.034
将5000个单词与100000个最大长度为25的词典进行匹配
Method Memory (MB) Build Time (s) Lookup Time (s)
Naive 6.01-6.01, 6.01 0.024-0.026, 0.025 65.651-65.758, 65.715
JimMischel 6.32-6.32, 6.32 0.232-0.236, 0.233 1.208-1.254, 1.235
JimMattyDSL 5.95-5.96, 5.96 0.264-0.269, 0.266 0.050-0.052, 0.051
SimpleTrie 226.49-226.49, 226.49 0.932-0.962, 0.951 0.004-0.004, 0.004
CompessedTrie 16.10-16.10, 16.10 0.101-0.126, 0.111 0.003-0.003, 0.003
MattyMerrix 6.15-6.15, 6.15 0.254-0.269, 0.259 0.414-0.418, 0.416
将5000个单词与1000000个最大长度为25的词典进行匹配
Method Memory (MB) Build Time (s) Lookup Time (s)
JimMischel 57.69-57.69, 57.69 3.027-3.086, 3.052 16.341-16.415, 16.373
JimMattyDSL 60.88-60.88, 60.88 3.396-3.484, 3.453 0.399-0.400, 0.399
SimpleTrie 2124.57-2124.57, 2124.57 11.622-11.989, 11.860 0.006-0.006, 0.006
CompessedTrie 166.59-166.59, 166.59 2.813-2.832, 2.823 0.005-0.005, 0.005
MattyMerrix 62.71-62.73, 62.72 3.230-3.270, 3.251 6.996-7.015, 7.008
如您所见,(非空间优化的)trie 所需的内存要高得多。对于所有测试的实现,内存占用都随字典的大小按 O(N) 增长。
正如预期的那样,trie 的查找时间或多或少是恒定的:O(k),仅取决于搜索项的长度。对于其他实现,时间将根据要搜索的字典的大小而增加。
请注意,可以构建针对此问题的更优化的实现,对于搜索时间将接近O(k)并允许更紧凑的存储并减少内存占用。如果你映射到缩小的字母表(例如'A' - 'Z'),这也是可以利用的东西。
答案 1 :(得分:1)
所以你只想在字典中找到输入字符串前缀的单词?您可以比任何提出的方法更有效地完成此任务。它实际上只是一个修改过的合并。
如果你的单词列表包含一个用第一个字母键入的字典,每个条目包含一个以该字母开头的单词的排序列表,那么这样就可以了。最坏的情况是O(n + m),其中n是以字母开头的单词数,m是输入字符串的长度。
// Finds every word in a sorted word list that is a prefix of inputString by walking the
// list and the growing prefix in lock-step (a modified merge).
var inputString = "icegdt";
// get list of words that start with the first character
var wordsList = MyDictionary[inputString[0]];
// find all words that are prefixes of the input string
var iInput = 0;
var iWords = 0;
var prefix = inputString.Substring(0, iInput + 1);
while (iInput < inputString.Length && iWords < wordsList.Count)
{
    // C# strings have no '<' / '>' operators, so compare once per iteration.
    // NOTE(review): the ordinal comparison assumes wordsList is sorted in ordinal order - confirm.
    var result = string.CompareOrdinal(wordsList[iWords], prefix);
    if (result == 0)
    {
        // wordsList[iWords] is found!
        ++iWords;
    }
    else if (result > 0)
    {
        // The current word is alphabetically after the prefix.
        // So we need the next character.
        ++iInput;
        if (iInput < inputString.Length)
        {
            prefix = inputString.Substring(0, iInput + 1);
        }
    }
    else
    {
        // The prefix is alphabetically after the current word.
        // Advance the current word.
        ++iWords;
    }
}
如果这是您想要做的全部(查找作为输入字符串前缀的字典单词),那么您的字典没有特定的理由按第一个字符编制索引。给定一个排序的单词列表,您可以在第一个字母上进行二分查找以找到起点。与字典查找相比,这将花费稍微多一点的时间,但与在单词列表中搜索匹配项所花的时间相比,这个差异非常小。此外,排序的单词列表将比字典方法占用更少的内存。
如果要进行不区分大小写的比较,请将比较代码更改为:
// Fragment: replaces the comparison at the top of the loop body in the preceding snippet.
// The snippet is intentionally cut off after the opening brace of the second branch.
// The third argument (true) asks String.Compare to ignore case.
var result = String.Compare(wordsList[iWords], prefix, true);
if (result == 0)
{
// wordsList[iWords] is found!
++iWords;
}
// result > 0: the current word sorts after the prefix.
else if (result > 0)
{
这也将每次迭代的字符串比较次数减少到每次迭代一次。
答案 2 :(得分:0)
// Walks str one position at a time, stopping one short of the end so that x+1 stays in range,
// and tests each adjacent pair of characters.
// NOTE(review): x and str are not declared in this snippet, and ChrW / GetChar are not defined
// here - presumably the Visual Basic helpers of the same names; confirm before use.
while (x < str.Length-1)
{
// Matches when the character at x equals ChrW(10) and the one at x+1 equals ChrW(13)
// (presumably a line feed followed by a carriage return - TODO confirm intended order).
if (ChrW(10) == GetChar(str, x) && ChrW(13) == GetChar(str, x+1))
{
// x+2 - This new line
}
x++;
}
答案 3 :(得分:0)
这是我的第一次尝试,想要把它拿出来,以防我今天无法完成它。
/// <summary>
/// Incremental prefix matcher over a dictionary of sorted word lists keyed by first letter.
/// Call <see cref="InitCompare"/> with the first character, then <see cref="NextCompare"/>
/// once per following character; each call reports whether the prefix typed so far is
/// itself a word in the dictionary.
/// </summary>
public class CompareHelper
{
    //Should always be sorted in alphabetical order.
    public static Dictionary<char, List<string>> MyDictionary;
    //Words that still start with the prefix held in ThisWord.
    public static List<string> CurrentWordList;
    //Not used by this class itself; kept for callers.
    public static List<string> MatchedWordList;
    //First character of the word we are trying to find matches for.
    public static char InitChar;
    //The prefix accumulated so far.
    public static StringBuilder ThisWord;

    /// <summary>
    /// Initialize the compare. Sets the first character and checks whether a one-letter
    /// word exists for that character.
    /// </summary>
    /// <param name="firstChar">The first character in the word string.</param>
    /// <returns>True if a word was found; false when there is no bucket for the
    /// character, the bucket is empty, or its first word is longer than one letter.</returns>
    public static bool InitCompare(char firstChar)
    {
        InitChar = firstChar;
        ThisWord = new StringBuilder();
        ThisWord.Append(firstChar);

        //A missing or empty bucket means "no candidates" rather than an exception.
        List<string> bucket;
        if (!MyDictionary.TryGetValue(firstChar, out bucket) || bucket == null)
        {
            bucket = new List<string>();
        }
        CurrentWordList = bucket;
        return FirstWordEqualsPrefixLength();
    }

    /// <summary>
    /// Append this letter to ThisWord and narrow CurrentWordList to the words that
    /// still match. InitCompare must have been called first.
    /// </summary>
    /// <param name="nextChar">The next character in the word string.</param>
    /// <returns>True if the prefix built so far is a word in the list.</returns>
    public static bool NextCompare(char nextChar)
    {
        ThisWord.Append(nextChar);
        int currentIndex = ThisWord.Length - 1;
        if (!FindRemainingWords(nextChar, currentIndex))
        {
            //No matches.
            return false;
        }
        //The prefix has currentIndex + 1 characters, so compare against ThisWord.Length.
        return FirstWordEqualsPrefixLength();
    }

    //Because the list is sorted, a word equal to the current prefix (if present)
    //is the first entry, so checking its length is enough.
    private static bool FirstWordEqualsPrefixLength()
    {
        return CurrentWordList.Count > 0 && CurrentWordList[0].Length == ThisWord.Length;
    }

    /// <summary>
    /// Trim down CurrentWordList until it only contains words that have
    /// currChar at position currIndex.
    /// </summary>
    /// <param name="currChar">The next letter in ThisWord.</param>
    /// <param name="currIndex">The index of the letter.</param>
    /// <returns>True if there are words remaining in CurrentWordList.</returns>
    private static bool FindRemainingWords(char currChar, int currIndex)
    {
        if (CurrentWordList == null || CurrentWordList.Count < 1)
        {
            return false;
        }
        //A single linear pass replaces the earlier bisection loop, which never terminated.
        TrimWords(currChar, currIndex);
        return CurrentWordList.Count > 0;
    }

    //Keep only the contiguous run of words whose character at currIndex is currChar.
    //Words too short to have a character at currIndex never match.
    private static void TrimWords(char currChar, int currIndex)
    {
        int startIndex = -1;
        int endIndex = CurrentWordList.Count;
        for (int i = 0; i < CurrentWordList.Count; i++)
        {
            string word = CurrentWordList[i];
            bool matches = currIndex < word.Length && word[currIndex] == currChar;
            if (matches)
            {
                if (startIndex < 0)
                {
                    startIndex = i;
                }
            }
            else if (startIndex >= 0)
            {
                //First non-matching word after the run ends it.
                endIndex = i;
                break;
            }
        }

        //GetRange takes (index, count), not (start, end).
        CurrentWordList = startIndex < 0
            ? new List<string>()
            : CurrentWordList.GetRange(startIndex, endIndex - startIndex);
    }

    //Position of a compared letter relative to the current letter:
    //Before/After are more than one letter away, PREV/NEXT are the direct neighbours.
    public enum LetterPositionEnum
    {
        Before,
        PREV,
        MATCH,
        NEXT,
        After
    };

    /// <summary>
    /// Classifies where compareLetter sits in the alphabet relative to currChar.
    /// Only upper-case 'A'..'Z' are recognised; any other input yields After, as the
    /// original switch statement did in its default branches.
    /// </summary>
    public static LetterPositionEnum GetLetterPosition(char currChar, char compareLetter)
    {
        if (currChar < 'A' || currChar > 'Z' || compareLetter < 'A' || compareLetter > 'Z')
        {
            return LetterPositionEnum.After;
        }

        int offset = compareLetter - currChar;
        if (offset == 0)
        {
            return LetterPositionEnum.MATCH;
        }
        if (offset == 1)
        {
            return LetterPositionEnum.NEXT;
        }
        if (offset == -1)
        {
            return LetterPositionEnum.PREV;
        }
        return offset < 0 ? LetterPositionEnum.Before : LetterPositionEnum.After;
    }
}
答案 4 :(得分:0)
好的,这是我提出的最终解决方案,我不确定它是否是最佳优化,但似乎非常快,我喜欢逻辑并喜欢代码的简洁。
基本上在应用程序启动时,您将任意长度的单词列表传递给InitWords。这将对单词进行排序并将它们放入具有26个键的Dictionary中,每个字母对应一个键。
然后在播放期间,您将遍历字符集,始终以第一个字母开头,然后是第一个和第二个字母,依此类推。您整个时间都在减少CurrentWordList中的单词数。
所以如果你有字符串'icedgt'。您可以使用'i'调用InitCompare,这将从MyDictionary中获取Key'I'的KeyValuePair,然后您将看到第一个单词的长度是1,因为它们已经按字母顺序排列,单词“I”将是第一个字。然后在下一次迭代中,将'c'传递给NextCompare,这再次通过使用Linq返回具有第二个char'c'的单词来减小List大小。接下来你将做另一个NextCompare并传入'e',再次使用Linq减少CurrentWordList中的单词数。
所以在第一次迭代之后,你的CurrentWordList包含以'i'开头的每个单词,在下一次NextCompare之后它包含以'ic'开头的每个单词,再下一次NextCompare之后它包含以'ice'开头的每个单词,依此类推。
我不确定Linq是否会在速度方面击败我的手动巨型Switch Case,但它简单而优雅。为此,我很高兴。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace Xuzzle.Code
{
    /// <summary>
    /// Incremental prefix matcher. <see cref="InitWords"/> buckets a word list by first
    /// letter; <see cref="InitCompare"/> and <see cref="NextCompare"/> are then called one
    /// character at a time and report whether the prefix typed so far is itself a word.
    /// </summary>
    public class CompareHelper
    {
        //Should always be sorted in alphabetical order.
        public static Dictionary<char, List<string>> MyDictionary;
        //Words that still start with the prefix held in ThisWord.
        public static List<string> CurrentWordList;
        //First character of the word we are trying to find matches for.
        public static char InitChar;
        //The prefix accumulated so far.
        public static StringBuilder ThisWord;

        /// <summary>
        /// Init MyDictionary with the list of words passed in. Makes one key value pair
        /// per letter of LetterHelper.Alphabet; letters without words get an empty list.
        /// </summary>
        /// <param name="listOfWords">Words to index; null or empty entries are ignored.</param>
        /// <exception cref="ArgumentNullException">listOfWords is null.</exception>
        public static void InitWords(List<string> listOfWords)
        {
            if (listOfWords == null)
            {
                throw new ArgumentNullException("listOfWords");
            }

            MyDictionary = new Dictionary<char, List<string>>();
            foreach (char currChar in LetterHelper.Alphabet)
            {
                //Skip null/empty entries (they have no first character) and upper-case
                //invariantly so the bucketing does not depend on the current culture.
                //NOTE(review): presumably LetterHelper.Alphabet holds upper-case letters - confirm.
                var wordsParsed = listOfWords
                    .Where(currWord => !string.IsNullOrEmpty(currWord)
                                       && char.ToUpperInvariant(currWord[0]) == currChar)
                    .ToArray();
                Array.Sort(wordsParsed);
                MyDictionary.Add(currChar, wordsParsed.ToList());
            }
        }

        /// <summary>
        /// Initialize the compare. Sets the first character and checks whether a one-letter
        /// word exists for that character.
        /// </summary>
        /// <param name="firstChar">The first character in the word string.</param>
        /// <returns>True if a word was found; false when there is no bucket for the
        /// character, the bucket is empty, or its first word is longer than one letter.</returns>
        public static bool InitCompare(char firstChar)
        {
            InitChar = firstChar;
            ThisWord = new StringBuilder();
            ThisWord.Append(firstChar);

            //Look the character up as given, then fall back to its upper-case form,
            //because InitWords files words under the upper-cased first letter.
            List<string> bucket;
            if (!MyDictionary.TryGetValue(firstChar, out bucket))
            {
                MyDictionary.TryGetValue(char.ToUpperInvariant(firstChar), out bucket);
            }

            //InitWords stores empty lists for unused letters, so guard before indexing [0].
            CurrentWordList = bucket ?? new List<string>();
            return CurrentWordList.Count > 0 && CurrentWordList[0].Length == 1;
        }

        /// <summary>
        /// Append this letter to ThisWord and keep only the words that have that letter
        /// at the same position. InitCompare must have been called first.
        /// </summary>
        /// <param name="nextChar">The next character in the word string.</param>
        /// <returns>True if the prefix built so far is a word in the list.</returns>
        public static bool NextCompare(char nextChar)
        {
            ThisWord.Append(nextChar);
            int currentIndex = ThisWord.Length - 1;
            if (CurrentWordList == null || CurrentWordList.Count == 0)
            {
                //No matches.
                return false;
            }

            CurrentWordList = CurrentWordList
                .Where(word => word.Length > currentIndex && word[currentIndex] == nextChar)
                .ToList();

            //The list is sorted, so a word equal to the prefix would be the first entry.
            return CurrentWordList.Count > 0 && CurrentWordList[0].Length == ThisWord.Length;
        }
    }
}