查找字符串中包含最大长度的所有有序序列

时间:2013-08-12 15:42:37

标签: c++ string algorithm c++11 pattern-matching

我有以下问题要解决: 有两个任意长度、任意内容的字符串。我需要找出同时出现在这两个字符串中的、长度最大的所有有序序列(字符保持原有顺序,但不要求连续)。

示例1: 输入:“a1b2c3”“1a2b3c” 输出:“123”“12c”“1b3”“1bc”“a23”“a2c”“ab3”“abc”

示例2: 输入:“cadb”“abcd” 输出:“ab”“ad”“cd”

我用最直接的方式实现了它:两层循环加递归,然后去掉重复项,以及那些只是更长结果的一部分的结果(例如“abc”序列包含“ab”“ac”和“bc”序列,所以我要把这些过滤掉)。

// "match" argument here used as temporary buffer
// Enumerates common subsequences of a_str1[a_pos1..] and a_str2[a_pos2..] by brute force.
// Every pair of equal characters extends the current prefix held in "match"; the prefix is
// stored in "matches" whenever it cannot be extended any further from that point.
// "match" is restored to its incoming content before the function returns.
void match_recursive(set<string> &matches, string &match, const string &a_str1, const string &a_str2, size_t a_pos1, size_t a_pos2)
{
    const size_t len1 = a_str1.length();
    const size_t len2 = a_str2.length();
    bool extended = false;

    for(size_t p = a_pos1; p < len1; ++p)
    {
        const char ch = a_str1[p];

        for(size_t q = a_pos2; q < len2; ++q)
        {
            if(a_str2[q] != ch)
                continue;

            extended = true;
            match.push_back(ch);

            // Recurse only while both strings still have characters after the matched pair;
            // otherwise the prefix built so far is final.
            const bool both_have_tail = (p + 1 < len1) && (q + 1 < len2);
            if(both_have_tail)
                match_recursive(matches, match, a_str1, a_str2, p + 1, q + 1);
            else
                matches.emplace(match);

            match.pop_back();
        }
    }

    // No equal pair was found at all: the incoming prefix itself is a result.
    if(!extended)
        matches.emplace(match);
}

这个函数能解决问题,但其复杂度是不可接受的。例如,对于某些较长的输入,在我的机器上运行耗时过长(原文此处给出的示例字符串和具体耗时在转换过程中已损坏,无法还原)。我认为这个问题应该有某种简单的算法,但不知为何,我在网上找不到。

你们能指出我正确的方向吗?

3 个答案:

答案 0 :(得分:2)

请查阅“最长公共子序列(LCS)”问题,例如 http://en.wikipedia.org/wiki/Longest_common_subsequence_problem ,并弄清楚动态规划解法是如何求出两个序列的LCS的:它先求出两个序列各只取第一个字符时的LCS,然后逐步为两个序列越来越长的前缀对构建LCS解。你唯一需要做的修改是:在根据先前前缀对已算出的LCS解推导当前前缀对的LCS时,需要为这些先前的前缀对保存其全部LCS字符串,然后把这些LCS字符串集合(可能需要在每个字符串后追加一个字符)合并成当前前缀对的完整LCS字符串集合。这样就能有效地解决你的问题。还可以做得更高效:先只求出一个LCS并得到整体LCS长度,再找出所有对该LCS长度的计算路径有贡献的较早前缀对,然后只针对这些前缀对重新进行一遍动态规划迭代,这一次按我前面描述的方式记录所有可能的LCS序列。

答案 1 :(得分:0)

听起来你是想找出两个字符串之间的相似程度?多年前我在网上某处找到了这段代码,并稍微做了修改(抱歉,我已经无法注明出处了),之后经常使用它。它运行得非常快(至少对字符串而言)。你可能需要根据自己的需要进行修改。抱歉,这段代码是用VB写的。

Private Shared piScore As Integer
''' <summary>
''' Case-insensitive similarity measure for two strings.
''' Both inputs are upper-cased and passed through ReplaceSpecial before
''' they are compared.
''' Returns a numeric indication of their similarity
''' (0 = not at all similar, 100 = identical)
''' </summary>
''' <param name="psStr1">First string to compare</param>
''' <param name="psStr2">Second string to compare</param>
''' <returns>0-100 (0 = not at all similar, 100 = identical)</returns>
''' <remarks>Resets the shared piScore field and lets FindCommon accumulate into it.</remarks>
Public Shared Function Similar(ByVal psStr1 As String, ByVal psStr2 As String) As Integer
    ' A missing argument cannot be similar to anything.
    If psStr1 Is Nothing Or psStr2 Is Nothing Then
        Return 0
    End If

    ' Reduce each string to its simplest form (upper case, then filtered
    ' by ReplaceSpecial).
    Dim sClean1 As String = ReplaceSpecial(psStr1.ToUpper)
    Dim sClean2 As String = ReplaceSpecial(psStr2.ToUpper)

    ' One or both of the strings may be empty after the reduction.
    If sClean1.Trim = "" Or sClean2.Trim = "" Then Return 0

    ' Identical after the reduction.
    If sClean1 = sClean2 Then Return 100

    ' Start from a zero score; FindCommon adds the length of every common
    ' substring it finds.
    piScore = 0
    FindCommon(sClean1, sClean2)

    ' Express the score as a percentage of the maximum score, which is the
    ' average length of the two strings.
    Dim iTotalLen As Integer = Len(sClean1) + Len(sClean2)
    Return piScore * 200 / iTotalLen

End Function

''' <summary>
''' USED BY SIMILAR FUNCTION. Finds the longest common substring (two or more
''' characters) of psS1 and psS2, adds its length to the shared piScore field,
''' then calls itself on the text to the left and to the right of that substring.
''' </summary>
''' <param name="psS1">String that is searched with InStr for each candidate.</param>
''' <param name="psS2">String whose substrings are used as candidates.</param>
Private Shared Sub FindCommon(ByVal psS1 As String, ByVal psS2 As String)
    ' Finds longest common substring (other than single
    ' characters) in psS1 and psS2, then recursively
    ' finds longest common substring in left-hand
    ' portion and right-hand portion. Updates the
    ' cumulative score.

    ' iLongest   : length of the best match found so far (0 = none)
    ' iStartPos1 : 1-based position of that match in psS1
    ' iStartPos2 : 1-based position of that match in psS2
    Dim iLongest As Integer = 0, iStartPos1 As Integer = 0, iStartPos2 As Integer = 0, iJ As Integer = 0
    Dim sHoldStr As String = "", sTestStr As String = "", sLeftStr1 As String = "", sLeftStr2 As String = ""
    Dim sRightStr1 As String = "", sRightStr2 As String = ""

    ' sHoldStr walks through the suffixes of psS2. Once a suffix is no longer
    ' than the best match, it cannot contain a longer one, so the loop stops.
    sHoldStr = psS2
    Do While Len(sHoldStr) > iLongest

        ' sTestStr walks through the prefixes of the current suffix, longest
        ' first, down to two characters.
        sTestStr = sHoldStr
        Do While Len(sTestStr) > 1
            iJ = InStr(psS1, sTestStr)
            If iJ > 0 Then
                ' Test string is sub-set of the other string

                If Len(sTestStr) > iLongest Then
                    ' Test string is longer than previous
                    ' longest. Store its length and position.
                    iLongest = Len(sTestStr)
                    iStartPos1 = iJ
                    ' Position is looked up in the full psS2, so this is the
                    ' first occurrence of the candidate there.
                    iStartPos2 = InStr(psS2, sTestStr)
                End If

                ' No point in going further with this string
                Exit Do

            Else
                ' Test string is not a sub-set of the other
                ' string. Discard final character of test
                ' string and try again.
                sTestStr = Left(sTestStr, Len(sTestStr) - 1)
            End If

        Loop

        ' Now discard first char of test string and
        ' repeat the process.
        sHoldStr = Right(sHoldStr, Len(sHoldStr) - 1)

    Loop

    ' Update the cumulative score with the length of
    ' the common sub-string.
    piScore = piScore + iLongest

    ' We now have the longest common sub-string, so we
    ' can isolate the sub-strings to the left and right
    ' of it.

    ' NOTE(review): the left-hand parts are only examined when both matches
    ' start after position 3, i.e. when each left part has at least three
    ' characters. A two-character left part could still hold a common
    ' substring; confirm whether this threshold is intentional.
    If iStartPos1 > 3 And iStartPos2 > 3 Then
        sLeftStr1 = Left(psS1, iStartPos1 - 1)
        sLeftStr2 = Left(psS2, iStartPos2 - 1)

        If sLeftStr1.Trim <> "" And sLeftStr2.Trim <> "" Then
            ' Get longest common substring from left strings
            FindCommon(sLeftStr1, sLeftStr2)
        End If
    Else
        ' These locals are not read again below.
        sLeftStr1 = ""
        sLeftStr2 = ""
    End If
    ' The right-hand parts are examined whenever any match was found.
    If iLongest > 0 Then
        sRightStr1 = Mid(psS1, iStartPos1 + iLongest)
        sRightStr2 = Mid(psS2, iStartPos2 + iLongest)

        If sRightStr1.Trim <> "" And sRightStr2.Trim <> "" Then
            ' Get longest common substring from right strings
            FindCommon(sRightStr1, sRightStr2)
        End If
    Else
        ' These locals are not read again before the Sub ends.
        sRightStr1 = ""
        sRightStr2 = ""
    End If
End Sub

''' <summary>
''' USED BY SIMILAR FUNCTION. Returns sString without the characters whose
''' Asc code lies outside 48-57 and 65-90 (digits and upper-case letters).
''' </summary>
''' <param name="sString">Text to filter.</param>
''' <returns>The kept characters, in their original order.</returns>
Private Shared Function ReplaceSpecial(ByVal sString As String) As String
    Dim sKept As String = ""
    For iIdx As Integer = 0 To sString.Length - 1
        Dim iCode As Integer = Asc(sString.Chars(iIdx))
        Select Case iCode
            Case 48 To 57, 65 To 90
                ' Digit or upper-case letter: keep it.
                sKept &= Chr(iCode)
        End Select
    Next
    Return sKept
End Function

只需调用Similar函数,就会得到0到100之间的结果。

希望这有帮助

答案 2 :(得分:0)

以下是动态规划解法的代码。我用你给出的例子测试过。我以前解过LCS问题,但这是第一次把所有的LCS都打印出来。

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <set>
#include <string>
#include <vector>

using namespace std;

#define MAX_LENGTH 100

/**
 * Computes the longest common subsequence (LCS) of two NUL-terminated strings
 * and prints every distinct LCS to stdout.
 *
 * Output format: each LCS followed by a tab, then one newline. When either
 * input is empty only the newline is printed.
 *
 * The dynamic-programming table is kept as two rolling rows sized from the
 * inputs, so the input length is not limited by a compile-time constant.
 * The number of distinct LCS strings can still grow very quickly with the
 * input length.
 *
 * @param a first string (must not be null)
 * @param b second string (must not be null)
 * @return length of the longest common subsequence of a and b
 */
int lcs(const char* a, const char* b)
{
    typedef std::set<std::string> lcs_set;

    const std::size_t len_a = std::strlen(a);
    const std::size_t len_b = std::strlen(b);

    // Row i-1 ("prev") and row i ("curr") of the DP table. For every column j
    // the *_len vectors hold the LCS length of the prefixes a[0..i) / b[0..j)
    // and the *_set vectors hold all distinct LCS strings of those prefixes.
    std::vector<int> prev_len(len_b + 1, 0);
    std::vector<int> curr_len(len_b + 1, 0);
    std::vector<lcs_set> prev_set(len_b + 1);
    std::vector<lcs_set> curr_set(len_b + 1);

    // An empty prefix has exactly one LCS: the empty string. Column 0 of both
    // rows is never rewritten, so it keeps this value across the row swaps.
    for(std::size_t j = 0; j <= len_b; ++j)
        prev_set[j].insert("");
    curr_set[0].insert("");

    // Bottom up calculation
    for(std::size_t i = 1; i <= len_a; ++i)
    {
        for(std::size_t j = 1; j <= len_b; ++j)
        {
            lcs_set& cell = curr_set[j];
            cell.clear(); // holds stale data from two rows ago

            if(a[i - 1] == b[j - 1])
            {
                // Matching characters: every LCS of the shorter prefixes
                // grows by this character.
                curr_len[j] = prev_len[j - 1] + 1;
                const lcs_set& diagonal = prev_set[j - 1];
                for(lcs_set::const_iterator it = diagonal.begin(); it != diagonal.end(); ++it)
                    cell.insert(*it + a[i - 1]);
            }
            else
            {
                // No match: inherit from whichever neighbour is longer, or
                // from both when they tie.
                const int left = curr_len[j - 1];
                const int up = prev_len[j];
                curr_len[j] = (left > up) ? left : up;
                if(left >= up)
                    cell.insert(curr_set[j - 1].begin(), curr_set[j - 1].end());
                if(up >= left)
                    cell.insert(prev_set[j].begin(), prev_set[j].end());
            }
        }
        prev_len.swap(curr_len);
        prev_set.swap(curr_set);
    }

    // After the final swap "prev" is the last computed row.
    const int lcs_length = prev_len[len_b];

    // All ordered sequences with maximum length are here. The bottom-right
    // cell already contains every LCS of the full strings; any other cell of
    // the same length can only hold a subset of it.
    lcs_set result_set;
    if(len_a > 0 && len_b > 0)
        result_set.swap(prev_set[len_b]);

    // Comment this loop out if the sequences should not be printed.
    for(lcs_set::const_iterator it = result_set.begin(); it != result_set.end(); ++it)
        std::printf("%s\t", it->c_str());
    std::printf("\n");

    return lcs_length;
}

/**
 * Reads whitespace-separated pairs of words from stdin and, for each pair,
 * prints all longest common subsequences followed by their length.
 * Stops at end of input or as soon as a complete pair cannot be read.
 */
int main()
{
    char buf1[MAX_LENGTH], buf2[MAX_LENGTH];

    // Build "%<N>s %<N>s" with N = MAX_LENGTH - 1 so that scanf can never
    // write past the end of the buffers (one byte is needed for the NUL).
    char format[32];
    snprintf(format, sizeof(format), "%%%ds %%%ds", MAX_LENGTH - 1, MAX_LENGTH - 1);

    // Require both words: a return value of 1 would leave buf2 unset.
    while(scanf(format, buf1, buf2) == 2)
    {
        printf("length is: %d\n", lcs(buf1, buf2) );
    }
    return 0;
}