我想要一种高效的算法，在一个较大的序列中查找某个模式的所有出现位置。
例如,给出以下输入:
模式: GAS
序列: ASDFGASDFGASDFADFASDFGA
预期输出: {4, 9}
我已经根据一个类似问题（similar question）中被采纳的答案（accepted answer）实现了完成该任务的算法。但是，那里有一条评论（comment）指出该算法“在大字节数组上很慢”。
查阅资料之后，似乎最好的算法是 Boyer-Moore 字符串搜索算法（Boyer-Moore string search algorithm），CodeProject 上有它的 C# 实现，但我在把它改写成适用于泛型可枚举序列的版本时遇到了困难。
是否存在基于Boyer-Moore算法的现有解决方案,以便在.NET中查找泛型序列中所有模式的出现?
注意
虽然我在我的例子中使用了字符串,但我想要一个适用于任何实现IEnumerable的数据的答案。换句话说,它不仅应该在字符串上工作,而且应该在任何类型上工作。
答案 0 :(得分:3)
最坏情况的时间复杂度为 O(nm)（其中 n = seq.Count）：当序列由模式不断重复构成，而模式本身又是另一个更短的模式重复 m 次时出现（如果我错了，请纠正我）。
/// <summary>
/// Finds the zero-based start index of every occurrence (overlapping ones included) of
/// <paramref name="pattern"/> inside <paramref name="seq"/>, enumerating <paramref name="seq"/> once.
/// </summary>
/// <param name="seq">The sequence to search in.</param>
/// <param name="pattern">The non-empty pattern to search for.</param>
/// <returns>The start indexes of all matches, in ascending order.</returns>
/// <exception cref="ArgumentNullException"><paramref name="seq"/> or <paramref name="pattern"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="pattern"/> is empty.</exception>
List<int> LookFor<T>( IEnumerable<T> seq, T[ ] pattern )
    where T : IEquatable<T> {
    if ( seq == null )
        throw new ArgumentNullException( "seq" );
    if ( pattern == null )
        throw new ArgumentNullException( "pattern" );
    if ( pattern.Length == 0 )
        throw new ArgumentException( "The pattern cannot be empty.", "pattern" );

    // The default comparer dispatches to IEquatable<T>.Equals and tolerates null elements.
    var comparer = EqualityComparer<T>.Default;
    // Each node holds the number of pattern elements matched so far by one candidate match.
    var partialMatches = new LinkedList<int>( );
    var matches = new List<int>( );
    int i = 0;
    foreach ( T item in seq ) {
        // Every element equal to the first pattern element starts a new candidate.
        if ( comparer.Equals( item, pattern[ 0 ] ) )
            partialMatches.AddLast( 0 );

        var n = partialMatches.First;
        while ( n != null ) {
            // Capture the successor first: once a node is removed its Next is null,
            // which would otherwise end the walk and leave later candidates stale.
            var next = n.Next;
            if ( comparer.Equals( item, pattern[ n.Value ] ) ) {
                n.Value += 1;
                if ( n.Value == pattern.Length ) {
                    // Candidate is complete; it began pattern.Length - 1 elements ago.
                    matches.Add( i - pattern.Length + 1 );
                    partialMatches.Remove( n );
                }
            }
            else {
                // Candidate failed on this element.
                partialMatches.Remove( n );
            }
            n = next;
        }
        i += 1;
    }
    return matches;
}
测试:
// Demo driver: searches a text made of seven "abc" repetitions for the
// nine-character pattern "abcabcabc" and prints each match index on its own line.
void Main()
{
    char[] pattern = "abcabcabc".ToCharArray();
    List<int> hits = LookFor( "abcabcabcabcabcabcabc", pattern );
    hits.ForEach( hit => Console.WriteLine( "{0}", hit ) );
}
输出:
0
3
6
9
12
答案 1 :(得分:0)
在尝试理解 Boyer-Moore 算法未果之后，我写出了下面这段代码，它只需对较大的集合做一次遍历。
我没能将它与 Boyer-Moore 算法做对比测试，但它的效率相当不错；最坏情况出现在整个序列都是模式的重复时，复杂度为 O(nm)。
这是我的实施。让我知道你对它的看法。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace ConsoleApplication1
{
/// <summary>Console demo: reads a haystack and a needle, then marks every match position.</summary>
class Program
{
    /// <summary>
    /// Prompts for the text to search and the text to find, prints the haystack, and prints an
    /// arrow under each index at which an occurrence of the needle starts.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        Console.WriteLine("Enter the string you want to search within.");
        // ReadLine returns null when the input stream is exhausted; treat that as empty input.
        string hayStack = Console.ReadLine() ?? string.Empty;
        Console.WriteLine("Enter the string you want to search for.");
        string needle = Console.ReadLine() ?? string.Empty;

        // PatternSearch rejects an empty pattern, so report it instead of crashing.
        if (needle.Length == 0)
        {
            Console.WriteLine("The search string cannot be empty.");
            return;
        }

        var ps = new PatternSearch<char>(needle.ToCharArray());
        Console.WriteLine();
        Console.WriteLine();
        Console.WriteLine(hayStack);

        // A set gives constant-time membership checks while building the marker line.
        var matches = new HashSet<int>(ps.Matches(hayStack.ToCharArray()));
        var markers = new StringBuilder(hayStack.Length);
        for (int i = 0; i < hayStack.Length; i++)
            markers.Append(matches.Contains(i) ? "↑" : " ");
        Console.WriteLine(markers.ToString());

        // Keep the console window open until the user presses Enter.
        Console.ReadLine();
    }
}
/// <summary>
/// Implements a pattern searching algorithm with <b>O(nm)</b> worst-case performance
/// (n = number of elements searched, m = pattern length). The searched sequence is enumerated
/// exactly once per search, and no state is shared between searches, so one instance can be
/// reused on any number of sequences.
/// </summary>
/// <typeparam name="T">The data type of the elements to search.</typeparam>
public class PatternSearch<T>
{
    // Private copy of the pattern, so later changes to the caller's array cannot affect searches.
    private readonly T[] pattern;
    // Called as eqComp(sequenceElement, patternElement).
    private readonly Func<T, T, bool> eqComp;

    /// <summary>
    /// Initializes a new instance of the <see cref="PatternSearch{T}" /> class that compares
    /// elements with the default equality comparer for <typeparamref name="T"/>.
    /// </summary>
    /// <param name="pattern">The pattern that will be searched for.</param>
    /// <exception cref="ArgumentNullException"><paramref name="pattern"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="pattern"/> is empty.</exception>
    public PatternSearch(T[] pattern)
        : this(pattern, (x, y) => EqualityComparer<T>.Default.Equals(x, y)) { }

    /// <summary>
    /// Initializes a new instance of the <see cref="PatternSearch{T}"/> class with the specified equality comparer.
    /// </summary>
    /// <param name="pattern">The pattern to be searched for.</param>
    /// <param name="equalityComparer">The equality comparer to use for matching elements.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="pattern"/> or <paramref name="equalityComparer"/> is null.
    /// </exception>
    /// <exception cref="ArgumentException"><paramref name="pattern"/> is empty.</exception>
    public PatternSearch(T[] pattern, Func<T, T, bool> equalityComparer)
    {
        // Validate before touching the array so a null pattern reports the right exception.
        if (pattern == null)
            throw new ArgumentNullException("pattern", "The search pattern cannot be null.");
        if (equalityComparer == null)
            throw new ArgumentNullException("equalityComparer", "The equality comparer cannot be null.");
        if (pattern.Length == 0)
            throw new ArgumentException("The pattern cannot be empty.", "pattern");

        this.pattern = (T[])pattern.Clone();
        eqComp = equalityComparer;
    }

    /// <summary>
    /// Returns the start index of all occurrences of the search pattern within the specified sequence.
    /// </summary>
    /// <param name="seq">The larger sequence to find occurrences of the search pattern within.</param>
    /// <returns>The start indexes of all matches, in ascending order (lazily evaluated).</returns>
    /// <exception cref="ArgumentNullException"><paramref name="seq"/> is null.</exception>
    public IEnumerable<int> Matches(IEnumerable<T> seq)
    {
        return Matches(seq, 0, int.MaxValue);
    }

    /// <summary>
    /// Returns the start index of all occurrences of the search pattern within the specified sequence,
    /// ignoring everything before <paramref name="startIndex"/>.
    /// </summary>
    /// <param name="seq">The larger sequence to find occurrences of the search pattern within.</param>
    /// <param name="startIndex">The index in <paramref name="seq"/> to start searching at.</param>
    /// <returns>The start indexes of all matches, in ascending order (lazily evaluated).</returns>
    /// <exception cref="ArgumentNullException"><paramref name="seq"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="startIndex"/> is negative.</exception>
    public IEnumerable<int> Matches(IEnumerable<T> seq, int startIndex)
    {
        return Matches(seq, startIndex, int.MaxValue);
    }

    /// <summary>
    /// Returns the start index of all occurrences of the search pattern that lie entirely within
    /// the window of <paramref name="count"/> elements beginning at <paramref name="startIndex"/>.
    /// </summary>
    /// <param name="seq">The larger sequence to find occurrences of the search pattern within.</param>
    /// <param name="startIndex">The index in <paramref name="seq"/> to start searching at.</param>
    /// <param name="count">
    /// The maximum number of items in <paramref name="seq"/> to match. A window shorter than the
    /// pattern (including a negative count) produces no matches.
    /// </param>
    /// <returns>The start indexes of all matches, in ascending order (lazily evaluated).</returns>
    /// <exception cref="ArgumentNullException"><paramref name="seq"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="startIndex"/> is negative.</exception>
    public IEnumerable<int> Matches(IEnumerable<T> seq, int startIndex, int count)
    {
        // Arguments are checked here, outside the iterator, so errors surface at the call site
        // rather than on first enumeration.
        if (seq == null)
            throw new ArgumentNullException("seq", "The sequence to search cannot be null.");
        if (startIndex < 0)
            throw new ArgumentOutOfRangeException("startIndex", "The start index cannot be negative.");

        return MatchesIterator(seq, startIndex, count);
    }

    // Lazily scans seq once, tracking every candidate match that is still alive.
    private IEnumerable<int> MatchesIterator(IEnumerable<T> seq, int startIndex, int count)
    {
        int patLen = pattern.Length;

        // A window shorter than the pattern cannot contain a complete match.
        if (count < patLen)
            yield break;

        // Exclusive end of the window; computed as long so startIndex + count cannot overflow.
        long endPoint = (long)startIndex + count;

        // Matched length of each live candidate, oldest (longest) first.
        var partial = new List<int>();
        long i = -1;

        foreach (T item in seq)
        {
            i++;
            if (i < startIndex)
                continue;
            if (i >= endPoint)
                break;

            // Advance or drop each live candidate, compacting survivors in place.
            int kept = 0;
            for (int p = 0; p < partial.Count; p++)
            {
                int matched = partial[p];
                if (!eqComp(item, pattern[matched]))
                    continue; // candidate failed on this element

                matched++;
                if (matched == patLen)
                    yield return (int)(i - patLen + 1); // complete: report where it started
                else
                    partial[kept++] = matched;
            }
            partial.RemoveRange(kept, partial.Count - kept);

            // The current element may also start a brand-new candidate.
            if (eqComp(item, pattern[0]))
            {
                if (patLen == 1)
                    yield return (int)i;
                else
                    partial.Add(1);
            }
        }
    }
}
}