我正在尝试在大量文本中实现精确的文本搜索。为此我找到了一些Boyer Moore实现c#的例子,但现在我无法理解它是如何工作的。
例如,如果我有字符串这是要搜索的示例文字,并且想要搜索 它有效,但是如果我将搜索模式更改为以搜索和发送文本,它仍会返回值而不是-1。为什么会这样?在我的搜索文本中没有要搜索的模式字符串和文本。
下面的是我通过Stackoverflow找到的实现
public class BoyerMooreStringSearching
{
readonly Dictionary<char, LastIndexTable> _lastIndexTable = new Dictionary<char, LastIndexTable>();
public string PatternToSearch;
public List<int> GetStartingIndexsOfPatternInText(string textToSearchIn, string patternToSearch)
{
var list = new List<int>();
PatternToSearch = patternToSearch;
if (patternToSearch != null && !string.IsNullOrEmpty(textToSearchIn))
{
UpdateLastIndexTable(patternToSearch);
PatternToSearch = patternToSearch;
var j = patternToSearch.Length - 1;
// Main loop to iterate over whole text
while (j <= textToSearchIn.Length - 1)
{
var lastCharOfPattern = patternToSearch[patternToSearch.Length - 1];
if (textToSearchIn[j] != lastCharOfPattern)
{
// Heuristic 1
// If Last Char is not matched with the Last char in pattern and char is not present in the pattern
// Then advance pointer 'j' to the length of the pattern in textToSearch.
if (!_lastIndexTable.ContainsKey(textToSearchIn[j]))
{
j += patternToSearch.Length - 1;
}
// Heuristic 2
// Consult the lastIndex table to get the last index of current char in textToSearch
// and advance pointer 'j' to the last index in textToSearch.
if (j <= textToSearchIn.Length - 1 && _lastIndexTable.ContainsKey(textToSearchIn[j]))
{
var tempObj = _lastIndexTable[textToSearchIn[j]];
if (tempObj != null) j += tempObj.LastIndex;
}
}
int k = patternToSearch.Length - 1;
int u = j;
if (j <= textToSearchIn.Length - 1)
{
while (k >= 0)
{
// Heuristic (3a)
// If Last Char is matched with the Last char in pattern then back track in the text and pattern till
// either you got a complete match or a mismatched charecter.
// Once you got the mismatched char and mismatched char is not present in the pattern then
// advance j to the index of mismatched charecter in the pattern
if (textToSearchIn[u] == patternToSearch[k])
{
if (k == 0 && textToSearchIn[u] == patternToSearch[k])
{
list.Add(u);
j += patternToSearch.Length - 1;
}
u--;
k--;
continue;
}
if (!_lastIndexTable.ContainsKey(textToSearchIn[u]))
{
// Heuristic (3b)
// If Last Char is matched with the Last char in pattern then back track in the text till
// either you got a complete match or a mismatched charecter.
// Once you got the mismatched char and mismatched char is not present in the pattern then
// advance j to the index of mismatched charecter in the pattern plus the number to char which matched.
j += k + (j - u);
break;
}
k--;
}
}
j++;
}
}
if (!list.Any())
list.Add(-1);
return list;
}
private void UpdateLastIndexTable(string patternToSearch)
{
_lastIndexTable.Clear();
var i = patternToSearch.Length - 1;
foreach (var charToSeach in patternToSearch)
{
if (_lastIndexTable.ContainsKey(charToSeach))
{
_lastIndexTable[charToSeach].LastIndex = i;
}
else
{
_lastIndexTable.Add(charToSeach, new LastIndexTable
{
CharSearched = charToSeach,
LastIndex = i
});
}
i--;
}
}
private class LastIndexTable
{
public char CharSearched { get; set; }
public int LastIndex { get; set; }
}
}
以下是它的示例用法。
var each = "this is sample text to search for";
var result = new BoyerMooreStringSearching().GetStartingIndexsOfPatternInText(each, "to search for and text");
答案 0 :(得分:1)
当回溯(启发式3a)时,你会一直在搜索字符串中找到字符,直到结束。您需要额外检查:
if (k == 0 && textToSearchIn[u] == patternToSearch[k])
{
if (u + patternToSearch.Length <= textToSearchIn.Length)
list.Add(u);
j += patternToSearch.Length - 1;
}