两个字符串之间的所有公共子串

时间:2013-06-11 22:35:52

标签: c#

我正在使用C#来查找两个字符串之间的所有常见子字符串。例如,如果输入是: S1 =“需要电子邮件协商” S2 =“需要电子邮件协助”

输出应该是“需要帮助电子邮件”

下面的代码返回最长的公共子字符串,但我希望我的代码返回所有常见的子字符串。任何帮助深表感谢!

static void commonsubstrings()
    {
        input1 = "need asssitance with email";
        input2 = "email assistance needed"

        if (input2.Length > input1.Length)
        {
            swap = input1;
            input1 = input2;
            input2 = swap;
        }

        int k = 1;
        String temp;
        String longTemp = "";
        for (int i = 0; (i <= input1.Length); i++)
        {
            if ((i == input1.Length))
            {
                if (longest != null)
                {
                    k = longest.Length + 1;
                }
                else
                {
                    k = 1;
                }
                temp = input1.Substring(1, input1.Length - 1);
                if (temp.Equals(""))
                {
                    break;
                }
                if (k <= temp.Length)
                {
                    i = k - 1;
                    input1 = temp;
                    if ((longest != null) && (longest.Length > longTemp.Length))
                    {
                        longTemp = longest;
                    }
                }
            }
            holder1 = input1.Substring(0, k);

            for (int j = 0; (j < input2.Length) && (j + k <= input2.Length); j++)
            {
                check1 = input2.Substring(j, k);
                if (holder1.Equals(check1))
                {
                    longest = holder1;
                    break;
                }
            }
            k++;
        }

        Console.WriteLine(longest);
        Console.ReadLine(); 

}

3 个答案:

答案 0 :(得分:4)

另一种方法:您可以使用levenshtein distance来查找两个单词的相似度。如果距离小于指定值,您可以认为两个字符串相等。然后你可以使用levenshtein比较器Enumerable.Intersect

然后很简单:

string S1= "need asssitance with email" ;
string S2 = "email assistance needed";
string[] words1 = S1.Split();
string[] words2 = S2.Split();
var wordsIntersecting = words1.Intersect(words2, new LevenshteinComparer());
string output = string.Join(" ", wordsIntersecting);

输出:need asssitance email

这是自定义比较器:

class LevenshteinComparer : IEqualityComparer<string>
{
    public int MaxDistance { get; set; }
    private Levenshtein _Levenshtein = new Levenshtein();

    public LevenshteinComparer() : this(50) { }

    public LevenshteinComparer(int maxDistance)
    {
        this.MaxDistance = maxDistance;
    }

    public bool Equals(string x, string y)
    {
        int distance = _Levenshtein.iLD(x, y);
        return distance <= MaxDistance;
    }

    public int GetHashCode(string obj)
    {
        return 0; 
    }
}

这是levenshtein算法的实现:

public class Levenshtein
{
    ///*****************************
    /// Compute Levenshtein distance 
    /// Memory efficient version
    ///*****************************
    public int iLD(String sRow, String sCol)
    {
        int RowLen = sRow.Length;  // length of sRow
        int ColLen = sCol.Length;  // length of sCol
        int RowIdx;                // iterates through sRow
        int ColIdx;                // iterates through sCol
        char Row_i;                // ith character of sRow
        char Col_j;                // jth character of sCol
        int cost;                   // cost

        /// Test string length
        if (Math.Max(sRow.Length, sCol.Length) > Math.Pow(2, 31))
            throw (new Exception("\nMaximum string length in Levenshtein.iLD is " + Math.Pow(2, 31) + ".\nYours is " + Math.Max(sRow.Length, sCol.Length) + "."));

        // Step 1

        if (RowLen == 0)
        {
            return ColLen;
        }

        if (ColLen == 0)
        {
            return RowLen;
        }

        /// Create the two vectors
        int[] v0 = new int[RowLen + 1];
        int[] v1 = new int[RowLen + 1];
        int[] vTmp;



        /// Step 2
        /// Initialize the first vector
        for (RowIdx = 1; RowIdx <= RowLen; RowIdx++)
        {
            v0[RowIdx] = RowIdx;
        }

        // Step 3

        /// Fore each column
        for (ColIdx = 1; ColIdx <= ColLen; ColIdx++)
        {
            /// Set the 0'th element to the column number
            v1[0] = ColIdx;

            Col_j = sCol[ColIdx - 1];


            // Step 4

            /// Fore each row
            for (RowIdx = 1; RowIdx <= RowLen; RowIdx++)
            {
                Row_i = sRow[RowIdx - 1];


                // Step 5

                if (Row_i == Col_j)
                {
                    cost = 0;
                }
                else
                {
                    cost = 1;
                }

                // Step 6

                /// Find minimum
                int m_min = v0[RowIdx] + 1;
                int b = v1[RowIdx - 1] + 1;
                int c = v0[RowIdx - 1] + cost;

                if (b < m_min)
                {
                    m_min = b;
                }
                if (c < m_min)
                {
                    m_min = c;
                }

                v1[RowIdx] = m_min;
            }

            /// Swap the vectors
            vTmp = v0;
            v0 = v1;
            v1 = vTmp;

        }


        // Step 7

        /// Value between 0 - 100
        /// 0==perfect match 100==totaly different
        /// 
        /// The vectors where swaped one last time at the end of the last loop,
        /// that is why the result is now in v0 rather than in v1
        //System.Console.WriteLine("iDist=" + v0[RowLen]);
        int max = System.Math.Max(RowLen, ColLen);
        return ((100 * v0[RowLen]) / max);
    }





    ///*****************************
    /// Compute the min
    ///*****************************

    private int Minimum(int a, int b, int c)
    {
        int mi = a;

        if (b < mi)
        {
            mi = b;
        }
        if (c < mi)
        {
            mi = c;
        }

        return mi;
    }

    ///*****************************
    /// Compute Levenshtein distance         
    ///*****************************

    public int LD(String sNew, String sOld)
    {
        int[,] matrix;              // matrix
        int sNewLen = sNew.Length;  // length of sNew
        int sOldLen = sOld.Length;  // length of sOld
        int sNewIdx; // iterates through sNew
        int sOldIdx; // iterates through sOld
        char sNew_i; // ith character of sNew
        char sOld_j; // jth character of sOld
        int cost; // cost

        /// Test string length
        if (Math.Max(sNew.Length, sOld.Length) > Math.Pow(2, 31))
            throw (new Exception("\nMaximum string length in Levenshtein.LD is " + Math.Pow(2, 31) + ".\nYours is " + Math.Max(sNew.Length, sOld.Length) + "."));

        // Step 1

        if (sNewLen == 0)
        {
            return sOldLen;
        }

        if (sOldLen == 0)
        {
            return sNewLen;
        }

        matrix = new int[sNewLen + 1, sOldLen + 1];

        // Step 2

        for (sNewIdx = 0; sNewIdx <= sNewLen; sNewIdx++)
        {
            matrix[sNewIdx, 0] = sNewIdx;
        }

        for (sOldIdx = 0; sOldIdx <= sOldLen; sOldIdx++)
        {
            matrix[0, sOldIdx] = sOldIdx;
        }

        // Step 3

        for (sNewIdx = 1; sNewIdx <= sNewLen; sNewIdx++)
        {
            sNew_i = sNew[sNewIdx - 1];

            // Step 4

            for (sOldIdx = 1; sOldIdx <= sOldLen; sOldIdx++)
            {
                sOld_j = sOld[sOldIdx - 1];

                // Step 5

                if (sNew_i == sOld_j)
                {
                    cost = 0;
                }
                else
                {
                    cost = 1;
                }

                // Step 6

                matrix[sNewIdx, sOldIdx] = Minimum(matrix[sNewIdx - 1, sOldIdx] + 1, matrix[sNewIdx, sOldIdx - 1] + 1, matrix[sNewIdx - 1, sOldIdx - 1] + cost);

            }
        }

        // Step 7

        /// Value between 0 - 100
        /// 0==perfect match 100==totaly different
        System.Console.WriteLine("Dist=" + matrix[sNewLen, sOldLen]);
        int max = System.Math.Max(sNewLen, sOldLen);
        return (100 * matrix[sNewLen, sOldLen]) / max;
    }
}

对Levenshtein课程的信任:http://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm

答案 1 :(得分:3)

    public static string [] CommonString(string left, string right)
    {
        List<string> result = new List<string>();
        string [] rightArray = right.Split();
        string [] leftArray = left.Split();

        result.AddRange(rightArray.Where(r => leftArray.Any(l => l.StartsWith(r))));

        // must check other way in case left array contains smaller words than right array
        result.AddRange(leftArray.Where(l => rightArray.Any(r => r.StartsWith(l))));

        return result.Distinct().ToArray();
    }

答案 2 :(得分:1)

使用Set-Intersections

从例程开始,查找字符串的所有可能的子字符串。这是在Python中,它是“读者练习”将其翻译为C#':

def allSubstr(instring):
  retset = set()
  retset.add(instring)
  totlen = len(instring)
  for thislen in range(0, totlen):
    for startpos in range(0, totlen):
      # print "startpos: %s, thislen: %s" % (startpos, thislen)
      addStr = instring[startpos:startpos+thislen]
      # print "addstr: %s" % (addStr)
      retset.add(addStr)
  print "retset total: %s" % (retset)
  return retset

set1 = allSubstr('abcdefg')
set2 = allSubstr('cdef')
print set1.intersection(set2)

这是输出:

>>> set1 = allSubstr('abcdefg')
retset total: set(['', 'cde', 'ab', 'ef', 'cd', 'abcdef', 'abc', 'efg', 'bcde', 'cdefg', 'bc', 'de',   'bcdef', 'abcd', 'defg', 'fg', 'cdef', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'bcd', 'abcde', 'abcdefg', 'bcdefg', 'def'])
>>> set2 = allSubstr('cdef')
retset total: set(['', 'cde', 'c', 'ef', 'e', 'd', 'f', 'de', 'cd', 'cdef', 'def'])
>>> 
>>> set1.intersection(set2)
set(['', 'cde', 'c', 'de', 'e', 'd', 'f', 'ef', 'cd', 'cdef', 'def'])

不,你对长度为1的子集不感兴趣。但是,在进行set.add()调用之前,你总是可以添加一个长度限制。