以最高分打印两个序列的所有比对

时间:2014-07-24 09:02:14

标签: c# algorithm data-structures dynamic-programming

Sequence Alignment是一个非常标准的问题,可用于生物信息学领域的DNA或蛋白质比对。我最近遇到了这个问题的不同版本。

给定两个输入字符串(假设字符串仅由A,C,G,T组成),问题基本上是根据以下矩阵找到最大对齐分数 -

   A  C  G  T  -
A  5 -1 -2 -1 -3  
C -1  5 -3 -2 -4
G -2 -3  5 -2 -2
T -1 -2 -2  5 -1
- -3 -4 -2 -1 Not Allowed

所以,如果A与 - 对齐,我们在对齐分数上加-3,或者如果G与T对齐,我们在分数上加-2,或者如果C与C对齐,我们加5。 因此,对于输入字符串AGTGATG和GTTAG,最大比对分数为14,其中一个具有最大分数的比对可以表示为

AGTGATG
-GTTA-G

比对得分计算如下:A- = -3,GG = 5,TT = 5,GT = -2,AA = 5,T- = -1,GG = 5。将它们相加:-3 + 5 + 5 - 2 + 5 - 1 + 5 = 14,这就是这对字符串可能得到的最大比对得分。

我已经能够用动态规划实现它并得到比对得分矩阵,但在打印这两个字符串所有具有最大比对得分的比对方式时遇到了问题。我试图像在LCS中那样回溯,但没能让它正常工作。下面附上我的代码。

static Dictionary<string, int> dict;

    /// <summary>
    /// Reads two sequences from the console, fills the global-alignment score
    /// matrix, prints the matrix and prints the best alignment score.
    /// </summary>
    /// <remarks>
    /// Both inputs are prefixed with "-" so that row 0 / column 0 of the matrix
    /// stand for the empty prefix of the respective sequence. Cell [i, j] holds
    /// the best score for aligning a[1..i] with b[1..j].
    /// </remarks>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        // Input is assumed to be upper-case and to contain only A, C, G and T.
        // Any pair that is not in the table silently scores 0 (TryGetValue default).

        Console.WriteLine("Enter first string : ");
        string a = Console.ReadLine();
        a = "-" + a;
        Console.WriteLine("Enter second string : ");
        string b = Console.ReadLine();
        b = "-" + b;
        int[,] SQ = new int[a.Length, b.Length];

        #region Create Dictionary
        // Key = symbol of the first sequence followed by symbol of the second one.
        dict = new Dictionary<string, int>
        {
            { "AA", 5 },  { "AC", -1 }, { "AG", -2 }, { "AT", -1 }, { "A-", -3 },
            { "CA", -1 }, { "CC", 5 },  { "CG", -3 }, { "CT", -2 }, { "C-", -4 },
            { "GA", -2 }, { "GC", -3 }, { "GG", 5 },  { "GT", -2 }, { "G-", -2 },
            { "TA", -1 }, { "TC", -2 }, { "TG", -2 }, { "TT", 5 },  { "T-", -1 },
            { "-A", -3 }, { "-C", -4 }, { "-G", -2 }, { "-T", -1 }, { "--", 0 }
        };
        #endregion Create Dictionary

        for (int i = 0; i < a.Length; i++)
        {
            for (int j = 0; j < b.Length; j++)
            {
                int pair = 0, gapAgainstB = 0, gapAgainstA = 0;
                dict.TryGetValue(a[i].ToString() + b[j].ToString(), out pair);
                dict.TryGetValue("-" + b[j].ToString(), out gapAgainstB);   // b[j] aligned with a gap
                dict.TryGetValue(a[i].ToString() + "-", out gapAgainstA);   // a[i] aligned with a gap

                if (i == 0 && j == 0)
                {
                    // Aligning two empty prefixes costs nothing.
                    SQ[i, j] = 0;
                }
                else if (i == 0)
                {
                    // Only b has been consumed so far: every symbol of b[1..j]
                    // is paired with a gap, so the penalties accumulate.
                    SQ[i, j] = SQ[i, j - 1] + gapAgainstB;
                }
                else if (j == 0)
                {
                    // Only a has been consumed so far: accumulate a[i]/gap penalties.
                    SQ[i, j] = SQ[i - 1, j] + gapAgainstA;
                }
                else
                {
                    // Coming from [i-1, j] consumes a[i] only (a[i] against a gap);
                    // coming from [i, j-1] consumes b[j] only (gap against b[j]).
                    int diagonal = SQ[i - 1, j - 1] + pair;
                    int fromAbove = SQ[i - 1, j] + gapAgainstA;
                    int fromLeft = SQ[i, j - 1] + gapAgainstB;
                    SQ[i, j] = Math.Max(diagonal, Math.Max(fromAbove, fromLeft));
                }
            }
        }

        // Dump the whole matrix, one row per line.
        for (int i = 0; i < a.Length; i++)
        {
            for (int j = 0; j < b.Length; j++)
            {
                Console.Write(SQ[i, j] + "   ");
            }
            Console.WriteLine();
        }

        Console.WriteLine("Alignment Score : " + SQ[a.Length - 1, b.Length - 1]);
        // Not defined in this snippet: this is the routine the question asks for.
        printAllAlignmentsWithHighestAlignmentScore();
        Console.Read();
    }

有人可以帮我实现printAllAlignmentsWithHighestAlignmentScore()函数吗?

4 个答案:

答案 0 :(得分:2)

最后,我写出了能实现我想要的功能的代码。这个问题实际上是 Needleman–Wunsch 算法的一个小变体。

代码:

/// <summary>
/// Console program that computes the best global alignment score of two
/// sequences and prints every alignment that reaches that score.
/// </summary>
class Program
{
    // Characters the score table has entries for. '-' stays accepted because
    // the table contains "-X", "X-" and "--" keys, exactly as before.
    const string AllowedSymbols = "ACGT-";

    // Pair scores. Key = symbol of the first sequence followed by the symbol of
    // the second sequence; '-' denotes a gap.
    static Dictionary<string, int> dict = new Dictionary<string, int>
    {
        { "AA", 5 },  { "AC", -1 }, { "AG", -2 }, { "AT", -1 }, { "A-", -3 },
        { "CA", -1 }, { "CC", 5 },  { "CG", -3 }, { "CT", -2 }, { "C-", -4 },
        { "GA", -2 }, { "GC", -3 }, { "GG", 5 },  { "GT", -2 }, { "G-", -2 },
        { "TA", -1 }, { "TC", -2 }, { "TG", -2 }, { "TT", 5 },  { "T-", -1 },
        { "-A", -3 }, { "-C", -4 }, { "-G", -2 }, { "-T", -1 }, { "--", 0 }
    };

    // Score of aligning symbol x (first sequence) with symbol y (second sequence).
    static int PairScore(char x, char y)
    {
        return dict[x.ToString() + y];
    }

    /// <summary>
    /// Walks the score matrix backwards from cell [p, q] and prints every
    /// alignment whose steps all reproduce the stored cell values.
    /// </summary>
    /// <param name="SQ">Score matrix; SQ[i, j] is the best score for a[0..i) and b[0..j).</param>
    /// <param name="a">First sequence.</param>
    /// <param name="b">Second sequence.</param>
    /// <param name="p">Current row (number of symbols of <paramref name="a"/> still unaligned).</param>
    /// <param name="q">Current column (number of symbols of <paramref name="b"/> still unaligned).</param>
    /// <param name="str1">Already built tail of the first alignment row.</param>
    /// <param name="str2">Already built tail of the second alignment row.</param>
    static void printAllAlignments(int[,] SQ, string a, string b, int p, int q, string str1, string str2)
    {
        if (p == 0 || q == 0)
        {
            // On a border there is a single way back: pad the exhausted
            // sequence with gaps until the origin is reached.
            while (q > 0)
            {
                str1 = "-" + str1;
                str2 = b[--q] + str2;
            }
            while (p > 0)
            {
                str1 = a[--p] + str1;
                str2 = "-" + str2;
            }
            Console.WriteLine("\n" + str1 + "\n" + str2 + "\n");
            return;
        }

        char symbolA = a[p - 1];
        char symbolB = b[q - 1];

        // Follow every predecessor that explains the value of this cell, not
        // just the first one; ties are what produce multiple optimal alignments.
        if (SQ[p, q] == SQ[p - 1, q - 1] + PairScore(symbolA, symbolB))
        {
            printAllAlignments(SQ, a, b, p - 1, q - 1, symbolA + str1, symbolB + str2);
        }

        if (SQ[p, q] == SQ[p - 1, q] + PairScore(symbolA, '-'))
        {
            printAllAlignments(SQ, a, b, p - 1, q, symbolA + str1, "-" + str2);
        }

        if (SQ[p, q] == SQ[p, q - 1] + PairScore('-', symbolB))
        {
            printAllAlignments(SQ, a, b, p, q - 1, "-" + str1, symbolB + str2);
        }
    }

    // Fills the (a.Length + 1) x (b.Length + 1) global-alignment score matrix.
    static int[,] BuildScoreMatrix(string a, string b)
    {
        int[,] SQ = new int[a.Length + 1, b.Length + 1];

        // Borders: one sequence is empty, so each symbol is paired with a gap.
        for (int i = 1; i <= a.Length; i++)
        {
            SQ[i, 0] = SQ[i - 1, 0] + PairScore(a[i - 1], '-');
        }
        for (int j = 1; j <= b.Length; j++)
        {
            SQ[0, j] = SQ[0, j - 1] + PairScore('-', b[j - 1]);
        }

        for (int i = 1; i <= a.Length; i++)
        {
            for (int j = 1; j <= b.Length; j++)
            {
                int diagonal = SQ[i - 1, j - 1] + PairScore(a[i - 1], b[j - 1]);
                int fromAbove = SQ[i - 1, j] + PairScore(a[i - 1], '-');
                int fromLeft = SQ[i, j - 1] + PairScore('-', b[j - 1]);
                SQ[i, j] = Math.Max(diagonal, Math.Max(fromAbove, fromLeft));
            }
        }
        return SQ;
    }

    // Reads one line; end-of-input counts as an empty sequence. Surrounding
    // whitespace is dropped and letters are upper-cased to match the table keys.
    static string ReadSequence()
    {
        string line = Console.ReadLine();
        return (line ?? string.Empty).Trim().ToUpperInvariant();
    }

    // True when every character of the sequence has entries in the score table.
    static bool IsValidSequence(string sequence)
    {
        foreach (char symbol in sequence)
        {
            if (AllowedSymbols.IndexOf(symbol) < 0)
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Prompts for two sequences, prints the maximum alignment score and then
    /// every alignment that reaches it.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        Console.WriteLine("Enter first string : ");
        string a = ReadSequence();
        Console.WriteLine("Enter second string : ");
        string b = ReadSequence();

        // Reject unknown characters up front instead of failing later with a
        // KeyNotFoundException from the score table lookup.
        if (!IsValidSequence(a) || !IsValidSequence(b))
        {
            Console.WriteLine("Error: sequences may contain only the characters A, C, G, T and '-'.");
            Console.Read();
            return;
        }

        int[,] SQ = BuildScoreMatrix(a, b);

        Console.WriteLine("Max Alignment Score : " + SQ[a.Length, b.Length]);
        printAllAlignments(SQ, a, b, a.Length, b.Length, "", "");
        Console.Read();
    }
}

答案 1 :(得分:0)

有趣的问题。你的代码中的“动态规划”体现在哪里?

我不确定您所说的“打印所有可能的比对”具体是想要什么,但下面这段快速而粗糙(quick and dirty)的代码也许会有帮助。它把每个比对打印成两行,如下所示:

- 0
-

- -2
G

.
.
.

AGTGATG 8
-GTTTTA

AGTGATTG 14
-GTTTTAG

请注意,您提到的最大对齐组合并未显示在此输出中:

AGTGATG
-GTTA-G

“在打印所有可能的比对时遇到了问题”是什么意思?

无论如何,我的代码(删除了字典初始化):

/// <summary>
/// Value stored in one cell of the score table: two strings built up while the
/// table is filled, plus the score recorded for that cell.
/// </summary>
public struct Alignment
{
    /// <summary>String accumulated from symbols of the first input.</summary>
    public string substringA;
    /// <summary>String accumulated from symbols of the second input.</summary>
    public string substringB;
    /// <summary>Score recorded for this cell.</summary>
    public int key;
}

// Reads two sequences from the console, fills a table of Alignment cells (score
// plus the two strings accumulated along the chosen path), prints every cell and
// finally prints the score stored in the bottom-right cell.
// args: command-line arguments; not used.
[MTAThread]
static void Main(string[] args)
{
    // Assumes the inputs are upper-case and consist only of A, C, G, T
    // (whether '-' may appear in the input is left open by the original note).

    Console.WriteLine("Enter first string : ");
    var a = Console.ReadLine();
    // Prefix a gap symbol so that index 0 stands for "-" in both sequences.
    a = "-" + a;
    Console.WriteLine("Enter second string : ");
    var b = Console.ReadLine();
    b = "-" + b;
    Alignment[,] SQ = new Alignment[a.Length, b.Length];

    // The score-table initialisation of `dict` was left out of this snippet.
    #region Create Dictionary
    ...
    #endregion Create Dictionary

    for (int i = 0; i < a.Length; i++)
    {
        for (int j = 0; j < b.Length; j++)
        {
            // key  : score of a[i] against b[j]
            // key1 : score of a gap against b[j]
            // key2 : score of a[i] against a gap
            // Each stays 0 when the pair is missing from the dictionary.
            int key = 0, key1 = 0, key2 = 0;
            dict.TryGetValue(a[i].ToString() + b[j].ToString(), out key);
            dict.TryGetValue("-" + b[j].ToString(), out key1);
            dict.TryGetValue(a[i].ToString() + "-", out key2);
            if (i == 0)
            {
                // First row: the cell holds the single pair "-" / b[j]; the
                // score is not accumulated along the row.
                SQ[i, j].substringA = "-";
                SQ[i, j].substringB = b[j].ToString();
                SQ[i, j].key = key1;
            }
            else if (j == 0)
            {
                // First column: the cell holds the single pair a[i] / "-".
                SQ[i, j].substringA = a[i].ToString();
                SQ[i, j].substringB = "-";
                SQ[i, j].key = key2;
            }
            else
            {
                // Get the maximum key value, and the substrings associated with it.
                // Ties are resolved in this order: the diagonal wins over both
                // others, and the cell above wins over the cell to the left.
                // NOTE(review): the cell above is combined with key1 ("-" / b[j])
                // and the cell to the left with key2 (a[i] / "-"); a step down
                // consumes a[i], so the two look swapped - confirm.
                int score;
                var score1 = SQ[i - 1, j].key + key1;
                var score2 = SQ[i, j - 1].key + key2;
                if (score1 >= score2)
                {
                    SQ[i, j].substringA = SQ[i - 1, j].substringA;
                    SQ[i, j].substringB = SQ[i - 1, j].substringB;
                    score = score1;
                }
                else
                {
                    SQ[i, j].substringA = SQ[i, j - 1].substringA;
                    SQ[i, j].substringB = SQ[i, j - 1].substringB;
                    score = score2;
                }

                var score3 = SQ[i - 1, j - 1].key + key;
                if (score3 >= score)
                {
                    SQ[i, j].substringA = SQ[i - 1, j - 1].substringA;
                    SQ[i, j].substringB = SQ[i - 1, j - 1].substringB;
                    score = score3;
                }
                // Both a[i] and b[j] are appended whichever predecessor won; no
                // "-" is inserted for the moves from above or from the left.
                SQ[i, j].substringA += a[i];
                SQ[i, j].substringB += b[j];
                SQ[i, j].key = score;
            }
        }
    }

    // Prints every cell of the table, not only the best-scoring one.
    PrintAlignments(SQ, a.Length, b.Length);

    Console.WriteLine("Alignment Score : " + SQ[a.Length - 1, b.Length - 1].key);            
    // Waits for a key press before returning.
    Console.Read();
}

/// <summary>
/// Writes every cell of the table to the console, row by row: first string and
/// score on one line, second string on the next, followed by an empty line.
/// </summary>
/// <param name="SQ">Table of cells to print.</param>
/// <param name="iLength">Number of rows to print.</param>
/// <param name="jLength">Number of columns to print.</param>
private static void PrintAlignments(Alignment[,] SQ, int iLength, int jLength)
{
    int row = 0;
    while (row < iLength)
    {
        int column = 0;
        while (column < jLength)
        {
            // Work on a local copy of the cell; it is only read here.
            Alignment cell = SQ[row, column];
            Console.WriteLine("{0} {1}", cell.substringA, cell.key);
            Console.WriteLine("{0}", cell.substringB);
            Console.WriteLine();
            column++;
        }
        row++;
    }
}

答案 2 :(得分:0)

每个状态(DP单元格)X有3个前驱状态,我们称之为X1、X2、X3。我们把状态X的得分记为 s(X),把从状态X转移到某个相邻状态Y的代价记为 c(X,Y)。

对于任何给定的状态X,通常只有一个前驱是最优的——这里的“最优”是按前驱自身的得分加上从它转移到X的代价来衡量的。例如可能有 s(X1)+c(X1,X) > s(X2)+c(X2,X) 且 s(X1)+c(X1,X) > s(X3)+c(X3,X):在这种情况下,X1是X的3个前驱中唯一的最优前驱,回溯时只需要沿着X1走。

但也可能出现2个、甚至3个前驱同样最优的情况,此时沿着这2个或3个最优前驱中的任何一个走,都会(由DP递推式的正确性保证)得到一个最优比对。通常您只需要生成一个比对,因此回溯时任意选取一个最优前驱并沿着它走即可。但如果您想生成全部最优比对,就需要遍历所有最优的前驱状态,并对每一个递归处理。

答案 3 :(得分:0)

只是因为这个问题对我来说很有意思,而且我对你的解法也很感兴趣,我很快写了一个暴力解法来做一些测试。对于一些很简单的输入,我发现我们得到的结果不同,例如TA和AT。你的得分结果是1(我不知道它对应的是哪种比对方式),而我的结果是:

TA-
-AT

得分为3。也许是我没有正确理解你的问题,如果你能核对一下,我会很高兴。

static Dictionary<string, int> dict;

    /// <summary>
    /// Reads two sequences, fills and prints the score matrix, prints the score
    /// of the bottom-right cell and then runs the brute-force enumeration on
    /// the raw (unprefixed) inputs.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        // Input is assumed to be upper-case and made of A, C, G, T only
        // (whether '-' may appear in the input is left open).

        Console.WriteLine("Enter first string : ");
        string realInputA = Console.ReadLine();
        Console.WriteLine("Enter second string : ");
        string realInputB = Console.ReadLine();

        // A leading "-" makes index 0 of each sequence the gap symbol.
        string rowSymbols = "-" + realInputA;
        string columnSymbols = "-" + realInputB;
        int rowCount = rowSymbols.Length;
        int columnCount = columnSymbols.Length;
        int[,] scoreMatrix = new int[rowCount, columnCount];

        #region Create Dictionary
        dict = new Dictionary<string, int>
        {
            { "AA", 5 },  { "AC", -1 }, { "AG", -2 }, { "AT", -1 }, { "A-", -3 },
            { "CA", -1 }, { "CC", 5 },  { "CG", -3 }, { "CT", -2 }, { "C-", -4 },
            { "GA", -2 }, { "GC", -3 }, { "GG", 5 },  { "GT", -2 }, { "G-", -2 },
            { "TA", -1 }, { "TC", -2 }, { "TG", -2 }, { "TT", 5 },  { "T-", -1 },
            { "-A", -3 }, { "-C", -4 }, { "-G", -2 }, { "-T", -1 }, { "--", 0 }
        };
        #endregion Create Dictionary

        for (int row = 0; row < rowCount; row++)
        {
            string rowSymbol = rowSymbols[row].ToString();
            for (int column = 0; column < columnCount; column++)
            {
                string columnSymbol = columnSymbols[column].ToString();

                // Missing pairs leave the corresponding score at 0.
                int pairScore, gapVsColumn, rowVsGap;
                dict.TryGetValue(rowSymbol + columnSymbol, out pairScore);
                dict.TryGetValue("-" + columnSymbol, out gapVsColumn);
                dict.TryGetValue(rowSymbol + "-", out rowVsGap);

                int cell;
                if (row == 0)
                {
                    cell = gapVsColumn;
                }
                else if (column == 0)
                {
                    cell = rowVsGap;
                }
                else
                {
                    // NOTE(review): the cell above is combined with the
                    // gap/column score and the cell to the left with the
                    // row/gap score, and the borders are not cumulative. This
                    // mirrors the recurrence of the question, whose result the
                    // accompanying text compares against - confirm before changing.
                    int viaDiagonal = scoreMatrix[row - 1, column - 1] + pairScore;
                    int viaAbove = scoreMatrix[row - 1, column] + gapVsColumn;
                    int viaLeft = scoreMatrix[row, column - 1] + rowVsGap;
                    cell = Math.Max(viaDiagonal, Math.Max(viaAbove, viaLeft));
                }
                scoreMatrix[row, column] = cell;
            }
        }

        // Print the matrix, one row per console line.
        for (int row = 0; row < rowCount; row++)
        {
            for (int column = 0; column < columnCount; column++)
            {
                Console.Write(scoreMatrix[row, column] + "   ");
            }
            Console.WriteLine();
        }

        Console.WriteLine("Alignment Score : " + scoreMatrix[rowCount - 1, columnCount - 1]);
        printAllAlignments(realInputA, realInputB);
        Console.Read();
    }

    /// <summary>
    /// Brute force: enumerates gapped versions of both inputs, scores each
    /// candidate pair and prints every pair that reaches the highest score.
    /// </summary>
    /// <param name="inputA">First sequence, without a leading gap symbol.</param>
    /// <param name="inputB">Second sequence, without a leading gap symbol.</param>
    static void printAllAlignments(string inputA, string inputB)
    {
        // Shortest candidate: the longer input. Longest candidate: no column
        // pairs two real symbols, so the lengths simply add up.
        int minLen = Math.Max(inputA.Length, inputB.Length);
        int maxLen = inputA.Replace("-", "").Length + inputB.Replace("-", "").Length;

        Dictionary<string, int> solutions =
            prepareStartSequences(inputA, inputB, minLen, new Dictionary<string, int>());

        addLongerSequences(inputA, minLen, maxLen, solutions);

        if (solutions.Count == 0)
        {
            // Every candidate was rejected, so there is no best score to report.
            Console.WriteLine("No valid alignment found.");
            return;
        }

        // A single pass finds the maximum; sorting the whole set is not needed.
        int maxScore = solutions.Values.Max();

        foreach (KeyValuePair<string, int> sol in solutions)
        {
            if (sol.Value != maxScore)
            {
                continue;
            }

            // Keys have the form "<row A>|<row B>".
            string[] rows = sol.Key.Split('|');
            Console.WriteLine("{0}\n{1}\tScore: {2}\n\n", rows[0], rows[1], sol.Value);
        }
    }

    /// <summary>
    /// Grows the candidate set one column at a time: every stored pair of
    /// length l - 1 is extended by inserting one gap into each row, and every
    /// valid, not yet known result of length l is added with its score.
    /// </summary>
    /// <param name="inputA">First sequence; kept for signature compatibility, no longer read.</param>
    /// <param name="minLen">Length of the start candidates already in <paramref name="solutions"/>.</param>
    /// <param name="maxLen">Largest candidate length to generate.</param>
    /// <param name="solutions">Candidates keyed by "rowA|rowB"; extended in place.</param>
    private static void addLongerSequences(string inputA, int minLen, int maxLen, Dictionary<string, int> solutions)
    {
        for (int l = minLen + 1; l <= maxLen; l++)
        {
            // Snapshot of the pairs of length l - 1. A key is "rowA|rowB" with
            // rows of equal length n, i.e. 2n + 1 characters, so Length / 2 == n.
            List<string[]> previous = solutions.Keys
                .Where(key => key.Length / 2 + 1 == l)
                .Select(key => key.Split('|'))
                .ToList();

            foreach (string[] rows in previous)
            {
                string rowA = rows[0];
                string rowB = rows[1];

                // Try every insertion position of the row being extended.
                // Bounding the loops by the raw input length would skip
                // positions once the rows are longer than that input.
                for (int idxA = 0; idxA <= rowA.Length; idxA++)
                {
                    string cA = rowA.Insert(idxA, "-");
                    for (int idxB = 0; idxB <= rowB.Length; idxB++)
                    {
                        // Same position in both rows yields a gap/gap column,
                        // which getScore rejects anyway.
                        if (idxA == idxB)
                        {
                            continue;
                        }

                        string cB = rowB.Insert(idxB, "-");
                        string key = cA + "|" + cB;
                        if (solutions.ContainsKey(key))
                        {
                            continue;
                        }

                        int score = getScore(cA, cB);
                        if (score > int.MinValue)
                        {
                            solutions.Add(key, score);
                        }
                    }
                }
            }
        }
    }

    /// <summary>
    /// Seeds the candidate set with the shortest pairs: the inputs themselves
    /// when they are equally long, otherwise the longer input paired with every
    /// distinct gap-padded version of the shorter one.
    /// </summary>
    /// <param name="inputA">First sequence.</param>
    /// <param name="inputB">Second sequence.</param>
    /// <param name="minLen">Length the shorter input is padded to.</param>
    /// <param name="solutions">Dictionary the seeds are added to.</param>
    /// <returns>
    /// A new dictionary with the entries of <paramref name="solutions"/> whose
    /// score is not <see cref="int.MinValue"/>.
    /// </returns>
    private static Dictionary<string, int> prepareStartSequences(string inputA, string inputB, int minLen, Dictionary<string, int> solutions)
    {
        bool aIsLonger = inputA.Length > inputB.Length;

        if (inputA.Length == inputB.Length)
        {
            solutions.Add(inputA + "|" + inputB, getScore(inputA, inputB));
        }
        else
        {
            string shortInput = aIsLonger ? inputB : inputA;
            string longInput = aIsLonger ? inputA : inputB;

            // Insert one gap per round at every possible position until the
            // padded strings reach minLen; duplicates are dropped each round.
            List<string> padded = new List<string> { shortInput };
            for (int currentLen = shortInput.Length; currentLen < minLen; currentLen++)
            {
                int slots = currentLen + 1;
                padded = padded
                    .SelectMany(candidate => Enumerable.Range(0, slots)
                                                       .Select(position => candidate.Insert(position, "-")))
                    .Distinct()
                    .ToList();
            }

            // Keep the first input on the left-hand side of the key.
            foreach (string gapped in padded)
            {
                string rowA = aIsLonger ? longInput : gapped;
                string rowB = aIsLonger ? gapped : longInput;
                solutions.Add(rowA + "|" + rowB, getScore(rowA, rowB));
            }
        }

        // Drop pairs that getScore marked as invalid.
        return solutions
            .Where(entry => entry.Value != int.MinValue)
            .ToDictionary(entry => entry.Key, entry => entry.Value);
    }

    /// <summary>
    /// Sums the pair scores of two gapped rows, column by column.
    /// </summary>
    /// <param name="inputA">First row.</param>
    /// <param name="inputB">Second row; must be at least as long as <paramref name="inputA"/>.</param>
    /// <returns>
    /// The total score, or <see cref="int.MinValue"/> as soon as a column pairs
    /// a gap with a gap. Pairs missing from the score table contribute 0.
    /// </returns>
    static int getScore(string inputA, string inputB)
    {
        int result = 0;
        for (int i = 0; i < inputA.Length; i++)
        {
            char symbolA = inputA[i];
            char symbolB = inputB[i];

            // A gap aligned with a gap is not allowed; mark the pair invalid.
            if (symbolA == '-' && symbolB == '-')
            {
                return int.MinValue;
            }

            // Single lookup instead of ContainsKey followed by the indexer.
            int pairScore;
            if (dict.TryGetValue(symbolA.ToString() + symbolB, out pairScore))
            {
                result += pairScore;
            }
        }
        return result;
    }