我正在尝试编写一个接收两个字符串的函数:一个是细菌的“基因组”,例如ATACGAGA之类;另一个是必须在基因组中查找的DNA序列,例如ATA。该函数应返回将DNA序列与基因组逐段比较时得到的最佳相似性评分。由于ATA有3个字符,函数先将ATA与基因组的第1至第3个字符比较,再与第2至第4个字符比较,然后与第3至第5个字符比较,依此类推。因为ATA恰好是基因组的前三个字符,所以相似性得分为1。再例如,如果要查找的序列是ATG,由于最多有2/3的字符相同,函数应返回0.66。
到目前为止,我的函数能够准确地计算出第一个for循环每次迭代的simScore。例如,如果基因组字符串是ABCABC,子字符串是ABC,那么由于ABC在i = 0和i = 3这两个位置与基因组完全匹配,函数能准确地计算出这两个位置的simScore。但是,要找到最佳的simScore,我需要以某种方式利用for循环每次迭代得到的simScore,做法有两种: 1. 每次迭代都将新的simScore与先前的simScore进行比较,并始终保留较高的那个;或者 2. 将每个新的simScore存储在一个新变量中,然后比较所有simScore以确定最高的simScore。
// Returns the best similarity score obtained by comparing `sequence` against
// every window of `genome` that has the same length as `sequence`.
//
// The score of one window is the fraction of positions whose characters are
// equal, so the result lies in [0, 1]; 1 means some window matches exactly.
// Returns 0 when either string is empty or when `sequence` is longer than
// `genome` (no full-length window exists).
double findBestSimScore(string genome, string sequence)
{
    if (genome.empty() || sequence.empty() || genome.length() < sequence.length())
    {
        return 0;
    }

    const string::size_type seqLen = sequence.length();
    // Only start positions that leave room for a complete window are valid;
    // going further would compare against characters past the end of the genome.
    const string::size_type windowCount = genome.length() - seqLen + 1;

    double bestSimScore = 0; // running maximum over all windows
    for (string::size_type start = 0; start < windowCount; start++)
    {
        string::size_type matches = 0;
        for (string::size_type c = 0; c < seqLen; c++)
        {
            if (genome[start + c] == sequence[c])
            {
                matches++;
            }
        }

        const double simScore = static_cast<double>(matches) / static_cast<double>(seqLen);
        // Keep the higher of the previous best and the score of this window.
        if (simScore > bestSimScore)
        {
            bestSimScore = simScore;
        }
    }
    return bestSimScore;
}
int main()
{
// Test 1: print the best similarity score of sequence "ABT" against genome "ABCABC".
// NOTE(review): the snippet ends here; the closing brace of main is not shown in this excerpt.
cout << findBestSimScore("ABCABC", "ABT") << endl;
答案 0 :(得分:0)
我会让序列从负位置开始在整个基因组上滑动来完成此操作(原始代码中的索引会越界)。如果发现更好的分数,就用 bestSimScore = newSimScore > bestSimScore ? newSimScore : bestSimScore; 更新最佳分数(这正是您的问题所在;把每个分数都存储起来再比较要复杂得多)。
一旦得到1.0的分数,就可以提前结束循环:
// Slides `sequence` across `genome`, including placements where the sequence
// only partially overlaps the genome at either end, and returns the highest
// fraction (matching characters / sequence length) seen at any placement.
// Positions of the sequence that fall outside the genome count as mismatches.
// Returns 0 when either string is empty.
double findBestSimScore(string genome, string sequence)
{
    if (genome.empty() || sequence.empty())
    {
        return 0;
    }

    const int genomeLen = static_cast<int>(genome.length());
    const int seqLen = static_cast<int>(sequence.length());

    double best = 0;
    // `offset` is the genome index aligned with sequence[0]; it starts negative
    // so that the last sequence character lines up with the first genome one.
    int offset = 1 - seqLen;
    while (offset < genomeLen)
    {
        // Restrict the comparison to the sequence positions that land inside the genome.
        const int first = offset < 0 ? -offset : 0;
        const int last = (genomeLen - offset < seqLen) ? genomeLen - offset : seqLen;

        int hits = 0;
        for (int k = first; k < last; ++k)
        {
            if (genome[offset + k] == sequence[k])
            {
                ++hits;
            }
        }

        const double score = static_cast<double>(hits) / static_cast<double>(seqLen);
        if (score > best)
        {
            best = score;
        }
        if (best == 1.0) // a perfect match cannot be beaten, so stop early
        {
            break;
        }
        ++offset;
    }
    return best;
}
// Checks findBestSimScore against hand-computed expectations, including
// placements where the sequence overhangs the end of the genome and a case
// where the sequence is longer than the genome.
int main()
{
    const double oneThird = 1.0 / 3.0;
    const double twoThirds = 2.0 / 3.0;
    const double fullMatch = 1.0;

    // One mismatching character out of three.
    assert(findBestSimScore("ABCABC", "ABT") == twoThirds);

    // Exact occurrences of the sequence inside the genome.
    assert(findBestSimScore("ABCABC", "ABC") == fullMatch);
    assert(findBestSimScore("ABCABC", "BC") == fullMatch);
    assert(findBestSimScore("ABCABC", "ABCABC") == fullMatch);

    // Best placement only partially overlaps the genome.
    assert(findBestSimScore("ABCABD", "BDE") == twoThirds);
    assert(findBestSimScore("BDE", "ABCABD") == oneThird);

    return 0;
}