Question

我正在尝试编写一个接受两个字符串的函数：一个是细菌的“基因组”，例如ATACGAGA或类似的字符串；另一个是必须在基因组中查找的DNA序列，例如ATA。该函数应该返回将DNA序列与基因组逐段比较时得到的最佳相似性得分。由于ATA有3个字符，函数先将ATA与基因组的前三个字符进行比较，然后与第二、第三、第四个字符比较，再与第三、第四、第五个字符比较，依此类推。由于ATA正是基因组的前三个字符，因此相似性得分为1。如果要查找的子字符串是ATG，则由于3个字符中有2个相同，应返回0.66。

到目前为止，我的函数能够准确地计算出第一个for循环每次迭代的simScore。例如，如果基因组字符串是ABCABC，子字符串是ABC，则由于ABC在这些位置与基因组完全匹配，它能准确地计算出i = 0和i = 3时的simScore。但是，要找到最佳的simScore，我需要以某种方式保留for循环每次迭代得到的simScore，可以采用以下两种做法之一： 1. 每次迭代将新的simScore与先前的simScore进行比较，并始终保留较高的那个；或者 2. 将每个新的simScore存储在一个新变量中，然后比较所有simScore以确定最高的一个。

/**
 * Returns the best similarity score obtained by sliding `sequence` across
 * `genome` and comparing it against every window of the same length.
 *
 * The score of one window is the fraction of positions at which the window
 * and `sequence` hold the same character, so the result lies in [0, 1].
 *
 * @param genome    the text that is scanned
 * @param sequence  the pattern compared against each window of `genome`
 * @return the highest per-window score; 0 when either string is empty or
 *         when `genome` is shorter than `sequence` (no full window exists)
 */
double findBestSimScore(string genome, string sequence)
{
    const string::size_type seqLen = sequence.length();

    // Nothing to compare, or no full-length window fits inside the genome.
    if (genome.empty() || seqLen == 0 || genome.length() < seqLen)
    {
        return 0;
    }

    // Only start positions that leave room for a complete window are valid;
    // going further would compare against characters past the end of genome.
    const string::size_type windowCount = genome.length() - seqLen + 1;

    double bestSimScore = 0;
    for (string::size_type start = 0; start < windowCount; ++start)
    {
        string::size_type matches = 0;
        for (string::size_type c = 0; c < seqLen; ++c)
        {
            if (genome[start + c] == sequence[c])
            {
                ++matches;
            }
        }

        // Keep the running maximum instead of storing every window's score.
        const double simScore =
            static_cast<double>(matches) / static_cast<double>(seqLen);
        if (simScore > bestSimScore)
        {
            bestSimScore = simScore;
        }

        // A perfect match cannot be improved on, so stop scanning.
        if (matches == seqLen)
        {
            break;
        }
    }

    return bestSimScore;
}

// Test driver from the question: prints the score that findBestSimScore
// returns for one sample genome/sequence pair.
// NOTE(review): the closing brace of main() is not visible in this excerpt.
int main()
{

    // Test 1: "ABT" against "ABCABC". By the question's own description a
    // window matching 2 of 3 characters should score about 0.66.
    //
    cout << findBestSimScore("ABCABC", "ABT") << endl;

Answer 1

我会从负的起始位置开始，让序列在整个基因组上滑动来完成此操作（原始代码中的索引会越界，是无效的）。如果发现更好的分数，就用 bestSimScore = newSimScore > bestSimScore ? newSimScore : bestSimScore; 更新最佳分数（这正是您的问题；把所有分数都存储起来要复杂得多）。一旦得到1.0分，就可以提前结束循环：

/**
 * Slides `sequence` across `genome`, including alignments where the sequence
 * only partly overlaps the genome at either end, and returns the highest
 * fraction of sequence positions that match.
 *
 * Positions of the sequence that fall outside the genome count as mismatches,
 * because the score is always divided by the full sequence length.
 *
 * @param genome    the text that is scanned
 * @param sequence  the pattern aligned against `genome`
 * @return the best score in [0, 1]; 0 when either string is empty
 */
double findBestSimScore(string genome, string sequence)
{
  if (genome.empty() || sequence.empty())
  {
    return 0;
  }

  const int genomeLen = static_cast<int>(genome.length());
  const int patternLen = static_cast<int>(sequence.length());

  double best = 0;
  // The first offset is negative so that only the last character of the
  // sequence overlaps the first character of the genome.
  for (int offset = 1 - patternLen; offset < genomeLen; ++offset)
  {
    int hits = 0;
    for (int k = 0; k < patternLen; ++k)
    {
      const int pos = offset + k;
      if (pos < 0 || pos >= genomeLen)
      {
        continue; // this sequence character hangs outside the genome
      }
      if (genome[pos] == sequence[k])
      {
        ++hits;
      }
    }

    const double score =
        static_cast<double>(hits) / static_cast<double>(patternLen);
    if (score > best)
    {
      best = score;
    }

    // A full match cannot be beaten, so stop scanning.
    if (best == 1.0)
    {
      break;
    }
  }
  return best;
}

// Checks findBestSimScore against hand-computed expected scores.
// Each assert aborts the program if the returned score differs from the
// expected fraction; main returns 0 when every check passes.
int main()
{
  // Two of the three characters of "ABT" match at the best alignment.
  assert(findBestSimScore("ABCABC", "ABT") == 2.0 / 3.0);
  // Exact occurrences of the sequence give a perfect score.
  assert(findBestSimScore("ABCABC", "ABC") == 1.0);
  assert(findBestSimScore("ABCABC", "BC") == 1.0);
  // Sequence and genome are identical.
  assert(findBestSimScore("ABCABC", "ABCABC") == 1.0);
  // Expected score is two thirds: "BD" lines up with the end of the genome.
  assert(findBestSimScore("ABCABD", "BDE") == 2.0 / 3.0);
  // Sequence longer than the genome: a non-zero score is still expected.
  assert(findBestSimScore("BDE", "ABCABD") == 1.0 / 3.0);
  return 0;
}

在for循环的每次迭代中创建一个新变量

1 个答案: