如何在两个大字符串之间找到lcs长度

时间:2014-02-15 12:34:25

标签: c# .net text lcs

我在C#中编写了以下代码,用于获取由use给出的两个文本的最长公共子序列的长度,但它不适用于大字符串。请你帮助我好吗。我真的很困惑。

public Form1()
{
    InitializeComponent();
}

public int lcs(char[] s1, char[] s2, int s1size, int s2size)
{
    if (s1size == 0 || s2size == 0)
    {
        return 0;
    }
    else
    {
        if (s1[s1size - 1] == s2[s2size - 1])
        {
            return (lcs(s1, s2, s1size - 1, s2size - 1) + 1);
        }
        else
        {
            int x = lcs(s1, s2, s1size, s2size - 1);
            int y = lcs(s1, s2, s1size - 1, s2size);
            if (x > y)
            {
                return x;
            }
            else
                return y;
        }
    }
}

private void button1_Click(object sender, EventArgs e)
{
    string st1 = textBox2.Text.Trim(' ');
    string st2 = textBox3.Text.Trim(' ');

    char[] a = st1.ToCharArray();
    char[] b = st2.ToCharArray();

    int s1 = a.Length;
    int s2 = b.Length;

    textBox1.Text = lcs(a, b, s1, s2).ToString(); 
}

4 个答案:

答案 0 :(得分:8)

这里使用的是递归方法。因此,如您所述,它会导致程序出现性能问题。

使用动态编程方法,而不是递归。

这是C#代码。

public static void LCS(char[] str1, char[] str2)
    {
        int[,] l = new int[str1.Length, str2.Length];
        int lcs = -1;
        string substr = string.Empty;
        int end = -1;

        for (int i = 0; i < str1.Length; i++)
        {
            for (int j = 0; j < str2.Length; j++)
            {
                if (str1[i] == str2[j])
                {
                    if (i == 0 || j == 0)
                    {
                        l[i, j] = 1;
                    }
                    else
                        l[i, j] = l[i - 1, j - 1] + 1;
                    if (l[i, j] > lcs)
                    {
                        lcs = l[i, j];
                        end = i;
                    }

                }
                else
                    l[i, j] = 0;
            }
        }

        for (int i = end - lcs + 1; i <= end; i++)
        {
            substr += str1[i];
        }

        Console.WriteLine("Longest Common SubString Length = {0}, Longest Common Substring = {1}", lcs, substr);
    } 

答案 1 :(得分:1)

以下是如何在C#中找到最长公共子字符串的解决方案:

public static string GetLongestCommonSubstring(params string[] strings)
{
    var commonSubstrings = new HashSet<string>(strings[0].GetSubstrings());
    foreach (string str in strings.Skip(1))
    {
        commonSubstrings.IntersectWith(str.GetSubstrings());
        if (commonSubstrings.Count == 0)
            return string.Empty;
    }

    return commonSubstrings.OrderByDescending(s => s.Length).DefaultIfEmpty(string.Empty).First();
}

private static IEnumerable<string> GetSubstrings(this string str)
{
    for (int c = 0; c < str.Length - 1; c++)
    {
        for (int cc = 1; c + cc <= str.Length; cc++)
        {
            yield return str.Substring(c, cc);
        }
    }
}

我在这里找到了它:http://www.snippetsource.net/Snippet/75/longest-common-substring

答案 2 :(得分:1)

只是为了好玩,这是一个使用Queue<T>的示例:

string LongestCommonSubstring(params string[] strings)
{
    return LongestCommonSubstring(strings[0], new Queue<string>(strings.Skip(1)));
}

string LongestCommonSubstring(string x, Queue<string> strings)
{
    if (!strings.TryDequeue(out var y))
    {
        return x;
    }

    var output = string.Empty;

    for (int i = 0; i < x.Length; i++)
    {
        for (int j = x.Length - i; j > -1; j--)
        {
            string common = x.Substring(i, j);
            if (y.IndexOf(common) > -1 && common.Length > output.Length) output = common;
        }
    }

    return LongestCommonSubstring(output, strings);
}

尽管它仍在使用递归,但这是Queue<T>的一个很好的例子。

答案 3 :(得分:0)

我从https://iq.opengenus.org/longest-common-substring-using-rolling-hash/的Ashutosh Singh重构了C ++代码,以在C#中创建滚动哈希方法-这将在O(N * log(N)^ 2)时间和O(N)空间中找到子字符串

using System;
using System.Collections.Generic;
public class RollingHash
{
    private class RollingHashPowers
    {
        // _mod = prime modulus of polynomial hashing
        // any prime number over a billion should suffice
        internal const int _mod = (int)1e9 + 123;
        // _hashBase = base (point of hashing)
        // this should be a prime number larger than the number of characters used
        // in my use case I am only interested in ASCII (256) characters
        // for strings in languages using non-latin characters, this should be much larger
        internal const long _hashBase = 257;
        // _pow1 = powers of base modulo mod
        internal readonly List<int> _pow1 = new List<int> { 1 };
        // _pow2 = powers of base modulo 2^64
        internal readonly List<long> _pow2 = new List<long> { 1L };

        internal void EnsureLength(int length)
        {
            if (_pow1.Capacity < length)
            {
                _pow1.Capacity = _pow2.Capacity = length;
            }
            for (int currentIndx = _pow1.Count - 1; currentIndx < length; ++currentIndx)
            {
                _pow1.Add((int)(_pow1[currentIndx] * _hashBase % _mod));
                _pow2.Add(_pow2[currentIndx] * _hashBase);
            }
        }
    }

    private class RollingHashedString
    {
        readonly RollingHashPowers _pows;
        readonly int[] _pref1; // Hash on prefix modulo mod
        readonly long[] _pref2; // Hash on prefix modulo 2^64

        // Constructor from string:
        internal RollingHashedString(RollingHashPowers pows, string s, bool caseInsensitive = false)
        {
            _pows = pows;
            _pref1 = new int[s.Length + 1];
            _pref2 = new long[s.Length + 1];

            const long capAVal = 'A';
            const long capZVal = 'Z';
            const long aADif = 'a' - 'A';

            unsafe
            {
                fixed (char* c = s)
                {
                    // Fill arrays with polynomial hashes on prefix
                    for (int i = 0; i < s.Length; ++i)
                    {
                        long v = c[i];
                        if (caseInsensitive && capAVal <= v && v <= capZVal)
                        {
                            v += aADif;
                        }
                        _pref1[i + 1] = (int)((_pref1[i] + v * _pows._pow1[i]) % RollingHashPowers._mod);
                        _pref2[i + 1] = _pref2[i] + v * _pows._pow2[i];
                    }
                }
            }
        }

        // Rollingnomial hash of subsequence [pos, pos+len)
        // If mxPow != 0, value automatically multiply on base in needed power.
        // Finally base ^ mxPow
        internal Tuple<int, long> Apply(int pos, int len, int mxPow = 0)
        {
            int hash1 = _pref1[pos + len] - _pref1[pos];
            long hash2 = _pref2[pos + len] - _pref2[pos];
            if (hash1 < 0)
            {
                hash1 += RollingHashPowers._mod;
            }
            if (mxPow != 0)
            {
                hash1 = (int)((long)hash1 * _pows._pow1[mxPow - (pos + len - 1)] % RollingHashPowers._mod);
                hash2 *= _pows._pow2[mxPow - (pos + len - 1)];
            }
            return Tuple.Create(hash1, hash2);
        }
    }

    private readonly RollingHashPowers _rhp;
    public RollingHash(int longestLength = 0)
    {
        _rhp = new RollingHashPowers();
        if (longestLength > 0)
        {
            _rhp.EnsureLength(longestLength);
        }
    }

    public string FindCommonSubstring(string a, string b, bool caseInsensitive = false)
    {
        // Calculate max neede power of base:
        int mxPow = Math.Max(a.Length, b.Length);
        _rhp.EnsureLength(mxPow);
        // Create hashing objects from strings:
        RollingHashedString hash_a = new RollingHashedString(_rhp, a, caseInsensitive);
        RollingHashedString hash_b = new RollingHashedString(_rhp, b, caseInsensitive);

        // Binary search by length of same subsequence:
        int pos = -1;
        int low = 0;
        int minLen = Math.Min(a.Length, b.Length);
        int high = minLen + 1;
        var tupleCompare = Comparer<Tuple<int, long>>.Default;
        while (high - low > 1)
        {
            int mid = (low + high) / 2;
            List<Tuple<int, long>> hashes = new List<Tuple<int, long>>(a.Length - mid + 1);
            for (int i = 0; i + mid <= a.Length; ++i)
            {
                hashes.Add(hash_a.Apply(i, mid, mxPow));
            }
            hashes.Sort(tupleCompare);
            int p = -1;
            for (int i = 0; i + mid <= b.Length; ++i)
            {
                if (hashes.BinarySearch(hash_b.Apply(i, mid, mxPow), tupleCompare) >= 0)
                {
                    p = i;
                    break;
                }
            }
            if (p >= 0)
            {
                low = mid;
                pos = p;
            }
            else
            {
                high = mid;
            }
        }
        // Output answer:
        return pos >= 0
            ? b.Substring(pos, low)
            : string.Empty;
    }
}