在较大的序列中查找所有出现的模式

时间:2012-09-26 14:28:47

标签: c# .net pattern-matching

我想要一种有效的算法来查找更大序列中所有出现的模式。

例如,给出以下输入:
模式: GAS
序列: ASDFGASDFGASDFADFASDFGA

预期输出: {4, 9}

我根据一个类似问题(similar question)中被采纳的答案(accepted answer),实现了完成该任务的算法。但是,那里有一条评论(comment)指出该算法“在大字节数组上很慢”。

查阅资料之后,看起来最合适的算法是 Boyer-Moore 字符串搜索算法(Boyer-Moore string search algorithm),CodeProject 上有它的 C# 实现,但我在把它改写成适用于泛型可枚举序列的版本时遇到了麻烦。

是否存在基于Boyer-Moore算法的现有解决方案,以便在.NET中查找泛型序列中所有模式的出现?

注意
虽然我在我的例子中使用了字符串,但我想要一个适用于任何实现IEnumerable的数据的答案。换句话说,它不仅应该在字符串上工作,而且应该在任何类型上工作。

2 个答案:

答案 0 :(得分:3)

最坏情况的复杂度是 O(nm)(其中 n = seq.Count):当序列由模式不断重复构成,而模式本身又是由另一个模式重复 m 次构成时,就会出现这种情况(如果我错了,请纠正我)。

/// <summary>
/// Finds the start index of every occurrence (overlapping ones included) of
/// <paramref name="pattern"/> inside <paramref name="seq"/>, reading the sequence once.
/// </summary>
/// <typeparam name="T">Element type; compared through its default equality comparer.</typeparam>
/// <param name="seq">The sequence to search. Enumerated exactly once.</param>
/// <param name="pattern">The non-empty pattern to look for.</param>
/// <returns>Zero-based start indexes of all matches, in ascending order.</returns>
/// <exception cref="ArgumentNullException"><paramref name="seq"/> or <paramref name="pattern"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="pattern"/> is empty.</exception>
List<int> LookFor<T>( IEnumerable<T> seq, T[ ] pattern )
        where T : IEquatable<T> {

    if ( seq == null ) {
        throw new ArgumentNullException( "seq" );
    }
    if ( pattern == null ) {
        throw new ArgumentNullException( "pattern" );
    }
    if ( pattern.Length == 0 ) {
        throw new ArgumentException( "The pattern cannot be empty.", "pattern" );
    }

    // The default comparer dispatches to IEquatable<T>.Equals but also copes with null elements.
    var comparer = EqualityComparer<T>.Default;

    // Each node holds how many pattern items a still-alive candidate has matched so far.
    var partialMatches = new LinkedList<int>( );
    var matches = new List<int>( );

    int i = 0;
    foreach ( T item in seq ) {
        // Every item equal to the first pattern item opens a new candidate.
        if ( comparer.Equals( item, pattern[ 0 ] ) ) {
            partialMatches.AddLast( 0 );
        }

        var n = partialMatches.First;
        while ( n != null ) {
            // Capture the successor first: a node that has been removed from the
            // list no longer links to its neighbours, so reading Next afterwards
            // would end the scan early and leave later candidates un-advanced.
            var next = n.Next;

            if ( comparer.Equals( item, pattern[ n.Value ] ) ) {
                n.Value += 1;
                if ( n.Value == pattern.Length ) {
                    // Candidate completed: it started pattern.Length - 1 items ago.
                    matches.Add( i - pattern.Length + 1 );
                    partialMatches.Remove( n );
                }
            }
            else {
                partialMatches.Remove( n );
            }

            n = next;
        }

        i += 1;
    }

    return matches;
}

测试:

// Demo: searches a string made of repeated "abc" for the nine-character
// pattern "abcabcabc" and prints each reported start index on its own line.
void Main()
{
    var pattern = new char[ ] { 'a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c' };
    var matches = LookFor( "abcabcabcabcabcabcabc", pattern );

    for ( int k = 0; k < matches.Count; k++ )
    {
        Console.WriteLine( "{0}", matches[ k ] );
    }
}

输出:

0
3
6
9
12

答案 1 :(得分:0)

在尝试理解 Boyer-Moore 算法却徒劳无功之后,我写出了下面这段代码,它只需对较大的集合进行一次遍历。

我无法将它与 Boyer-Moore 算法进行对比测试,但它的效率相当不错:最坏情况为 O(nm),出现在整个序列都由模式重复构成的时候。

这是我的实施。让我知道你对它的看法。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace ConsoleApplication1
{
    /// <summary>
    /// Console demo: reads a haystack and a needle from standard input, prints the
    /// haystack, and prints an arrow under every position where the needle starts.
    /// </summary>
    class Program
    {
        /// <summary>Entry point of the demo.</summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Console.ReadLine returns null when the input stream ends, so fall back to "".
            Console.WriteLine("Enter the string you want to search within.");
            string hayStack = Console.ReadLine() ?? string.Empty;
            Console.WriteLine("Enter the string you want to search for.");
            string needle = Console.ReadLine() ?? string.Empty;

            if (needle.Length == 0)
            {
                // PatternSearch rejects an empty pattern; report it instead of crashing.
                Console.WriteLine("The string to search for cannot be empty.");
            }
            else
            {
                var ps = new PatternSearch<char>(needle.ToCharArray());

                Console.WriteLine();
                Console.WriteLine();

                Console.WriteLine(hayStack);

                // A hash set keeps the per-character lookup below constant-time.
                var matches = new HashSet<int>(ps.Matches(hayStack.ToCharArray()));

                var markers = new StringBuilder(hayStack.Length);
                for (int i = 0; i < hayStack.Length; i++)
                {
                    markers.Append(matches.Contains(i) ? "↑" : " ");
                }

                Console.WriteLine(markers.ToString());
            }

            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }
    }

    /// <summary>
    /// Implements a single-pass pattern searching algorithm with <b>O(nm)</b> worst-case performance,
    /// where n is the number of items examined and m is the pattern length.
    /// </summary>
    /// <remarks>
    /// All per-search bookkeeping lives in locals of the search itself, so one instance can be
    /// used to search any number of sequences of any length.
    /// </remarks>
    /// <typeparam name="T">The element type of the pattern and of the sequences to search.</typeparam>
    public class PatternSearch<T>
    {
        private readonly T[] pattern;
        private readonly int patLen;
        private readonly Func<T, T, bool> eqComp;

        /// <summary>
        /// Initializes a new instance of the <see cref="PatternSearch{T}" /> class that compares
        /// elements with the default equality comparer for <typeparamref name="T"/>.
        /// </summary>
        /// <param name="pattern">The pattern that will be searched for.</param>
        /// <exception cref="ArgumentNullException"><paramref name="pattern"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="pattern"/> is empty.</exception>
        public PatternSearch(T[] pattern)
            : this(pattern, (x, y) => EqualityComparer<T>.Default.Equals(x, y)) { }

        /// <summary>
        /// Initializes a new instance of the <see cref="PatternSearch{T}"/> class with the specified equality comparer.
        /// </summary>
        /// <param name="pattern">The pattern to be searched for.</param>
        /// <param name="equalityComparer">
        /// The equality comparer to use for matching elements. It is invoked as
        /// <c>equalityComparer(sequenceItem, patternItem)</c>.
        /// </param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="pattern"/> or <paramref name="equalityComparer"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException"><paramref name="pattern"/> is empty.</exception>
        public PatternSearch(T[] pattern, Func<T, T, bool> equalityComparer)
        {
            // Validate before touching pattern.Length so a null pattern reports the right exception.
            if (pattern == null)
                throw new ArgumentNullException("pattern", "The search pattern cannot be null.");
            if (equalityComparer == null)
                throw new ArgumentNullException("equalityComparer", "The equality comparer cannot be null.");

            // ArgumentException takes (message, paramName), in that order.
            if (pattern.Length == 0)
                throw new ArgumentException("The pattern cannot be empty.", "pattern");

            this.pattern = pattern;
            patLen = pattern.Length;
            eqComp = equalityComparer;
        }

        /// <summary>
        /// Returns the start index of all occurrences of the search pattern within the specified sequence.
        /// </summary>
        /// <param name="seq">The larger sequence to find occurrences of the search pattern within.</param>
        /// <returns>Start indexes of all matches in ascending order; empty when <paramref name="seq"/> is null.</returns>
        public IEnumerable<int> Matches(IEnumerable<T> seq)
        {
            return this.Matches(seq, 0, int.MaxValue);
        }

        /// <summary>
        /// Returns the start index of all occurrences of the search pattern within the specified sequence,
        /// considering only items at or after <paramref name="startIndex"/>.
        /// </summary>
        /// <param name="seq">The larger sequence to find occurrences of the search pattern within.</param>
        /// <param name="startIndex">The index in <paramref name="seq"/> to start searching at.</param>
        /// <returns>Start indexes of all matches in ascending order; empty when <paramref name="seq"/> is null.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="startIndex"/> is negative.</exception>
        public IEnumerable<int> Matches(IEnumerable<T> seq, int startIndex)
        {
            return this.Matches(seq, startIndex, int.MaxValue);
        }

        /// <summary>
        /// Returns the start index of all occurrences of the search pattern that lie entirely inside
        /// the window of <paramref name="count"/> items beginning at <paramref name="startIndex"/>.
        /// </summary>
        /// <param name="seq">The larger sequence to find occurrences of the search pattern within.</param>
        /// <param name="startIndex">The index in <paramref name="seq"/> to start searching at.</param>
        /// <param name="count">The maximum number of items in <paramref name="seq"/> to match.</param>
        /// <returns>
        /// Start indexes (relative to the beginning of <paramref name="seq"/>) of all matches in ascending
        /// order. Empty when <paramref name="seq"/> is null or the window is shorter than the pattern.
        /// The sequence is enumerated lazily and at most once per enumeration of the result.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="startIndex"/> is negative.</exception>
        public IEnumerable<int> Matches(IEnumerable<T> seq, int startIndex, int count)
        {
            if (startIndex < 0)
                throw new ArgumentOutOfRangeException("startIndex", "The start index cannot be negative.");

            // A missing sequence or a window that cannot hold the pattern has no matches.
            if (seq == null || count < patLen)
                return Enumerable.Empty<int>();

            // Compute the exclusive end of the window in 64-bit so startIndex + count cannot wrap.
            int endPoint = (int)Math.Min((long)startIndex + count, int.MaxValue);
            return MatchesIterator(seq, startIndex, endPoint);
        }

        /// <summary>Walks <paramref name="seq"/> once and yields the start of each completed match.</summary>
        private IEnumerable<int> MatchesIterator(IEnumerable<T> seq, int startIndex, int endPoint)
        {
            // Matched-prefix length of every candidate that is still alive, oldest candidate first.
            var partial = new List<int>();

            int i = 0;
            foreach (T item in seq)
            {
                if (i >= endPoint)
                    yield break;

                if (i >= startIndex)
                {
                    // Advance the surviving candidates, compacting the list in place.
                    int kept = 0;
                    for (int m = 0; m < partial.Count; m++)
                    {
                        int matched = partial[m];
                        if (!eqComp(item, pattern[matched]))
                            continue; // mismatch: candidate is dropped

                        matched++;
                        if (matched == patLen)
                            yield return i - patLen + 1; // complete match ends at i
                        else
                            partial[kept++] = matched;
                    }
                    partial.RemoveRange(kept, partial.Count - kept);

                    // An item equal to the first pattern item opens a new candidate.
                    if (eqComp(item, pattern[0]))
                    {
                        if (patLen == 1)
                            yield return i;
                        else
                            partial.Add(1);
                    }
                }

                i++;
            }
        }

    }
}