在字符串(DNA)中搜索序列

时间:2015-03-25 13:08:51

标签: c++ string algorithm search dna-sequence

我需要编写一个程序,把字符串切分成长度为n(从n = 3开始)的子序列,并将每个子序列与同一字符串中其他长度相同的子序列进行比较。下面我来解释一下。

用户输入一个DNA字符串,例如“ACTGCGACGGTACGCTTCGACGTAG”。我们从n = 3开始,也就是先取DNA中的前三个字符来进行比较。

第一个子序列是“ACT”,我们需要将它与其他长度为3的子序列进行比较,例如[CTG,TGC,GCG ......直到结尾]。

如果我们发现另一个等于“ACT”的序列,我们保存位置。 这是另一个例子:

DNA:“ACTGCGACGGTACGCTTCGACGTAG”,我们找到了以下重复序列及其出现的位置:

  1. ACG:7 - 12 - 20
  2. CGA:5 - 18
  3. GAC:6 - 19
  4. GTA:10 - 22
  5. CGAC:5 - 18
  6. GACG:6 - 19
  7. CGACG:5 - 18

    数字是序列开头的位置(从1开始计数)。例如ACG的三处出现:ACTGCG[ACG]GT[ACG]CTTCG[ACG]TAG

    可以看到,我们从n = 3开始;处理完n = 3之后,n递增1变为n = 4,如此继续,直到n = DNA.size()。

    我的问题是:我有一个函数可以把DNA字符串切分成小段子序列,并用push_back()把它们保存到vector中,这样我就能查看是否有重复的序列,但是我不知道怎样才能得到它们在原字符串中的位置。

    我可以使用algorithm库,这个库里肯定有函数可以做到这一点,但我对这个库不太了解。

    这是我的代码:

    #include <iostream>
    #include <string>
    #include <vector>
    #include <algorithm>
    
    using namespace std;
    
    // Hard-coded sample sequence that main() slices into windows.
    const string DNA = "ACTGCGACGGTACGCTTCGACGTAG";
    // Length of the sample sequence ("taille" is French for "size");
    // not referenced by the code shown here.
    const size_t taille = DNA.size();
    
    // Window length used by main() when slicing DNA; starts at 3.
    size_t m = 3;
    // Not referenced by the code shown here.
    vector<string> v;
    
    /*
    struct DNA{
        const string dna;  // string entered by the user
        size_t taille;  // size of the string
        string chaine;  // string to search for
    };
    */
    
    // what kind of structs can i create? for me it's stupid to make any struct in this program.
    
    // Forward declarations; the definitions follow main().

    // Reports whether s passes the DNA validity check (see definition).
    bool checkDNA(string &s);
    // Copies characters of s starting at index i, stopping when the index
    // reaches m or the end of s.
    string takeStrings(const string &s,size_t i, size_t m);
    // Prints every element of s that compares equal to sq.
    void FindSequenceDNA(vector<string>&s,string sq);
    // Increments m while it is below DNA.size() and returns the new value.
    size_t incrementValue(size_t &m);
    
    
    
    int main(){
    
        string DNAuser;
        cout << "Introduce the DNA: ";
        cin >> DNAuser;
    
        bool request;
        cout << boolalpha;
        request = DNAuser.find_first_not_of("AGCT");
        cout << request << endl;
    
        vector<string> vectorSq;
        size_t auxiliar = 0;
        string r;
        size_t ocurrencies = DNA.size()-2;
        cout << "DNA: " << DNA << endl;
        while(auxiliar<ocurrencies){        // This gonna be works with the ocurriences, from 1 to end.
            r = takeStrings(DNA,auxiliar,auxiliar+m);
            auxiliar++;
            if(r.size()==m){
                vectorSq.push_back(r);
            }
        }
    
        // string res = takeStrings(DNA,0,3);
        // cout << "res: " << res << endl;
        // cout << "Printing vector: " << endl;
    
        // I just need to find the other, the practice is almost done.
    
        for(size_t i = 0; i< vectorSq.size(); i++){
            cout << vectorSq[i] << endl;
        }
    
        return 0;
    
    }
    
    
    /// Returns the characters of `s` in the half-open index range [i, m).
    ///
    /// @param s  Source string. When it is empty, "String is empty." is
    ///           printed to stdout and an empty string is returned.
    /// @param i  Index of the first character to copy.
    /// @param m  Index one past the last character to copy (an end index,
    ///           not a length). Values beyond s.size() are clamped.
    /// @return   The selected characters; empty when `i` is past the end of
    ///           `s` or when `i >= m`.
    string takeStrings(const string &s,size_t i, size_t m){
        if(s.empty()){
            cout << "String is empty." << endl;
            return string();
        }
        // An empty or inverted range yields nothing. The previous `i != m`
        // stop condition never fired when i started above m, so the whole
        // tail of the string was returned instead.
        if(i >= s.size() || i >= m){
            return string();
        }
        // substr() clamps the count to the characters actually available.
        return s.substr(i, m - i);
    }
    
    /// Prints every element of `s` that equals `sq`, each one preceded by a
    /// "function: " line. Prints "DNA invalid." when `s` has no elements.
    ///
    /// @param s   Candidate sequences to scan.
    /// @param sq  Sequence to look for.
    void FindSequenceDNA(vector<string>&s,string sq){
        if(s.empty()){
            cout << "DNA invalid." << endl;
            return;
        }

        for(vector<string>::const_iterator candidate = s.begin(); candidate != s.end(); ++candidate){
            if(!(sq == *candidate)){
                continue;
            }
            cout << "function: " << endl;
            // Only the matching text is printed; its position is not reported.
            cout << *candidate << endl;
        }
    }
    
    /// Reports whether `s` is a usable DNA string: at least three characters
    /// long and made up only of the letters A, C, G and T.
    ///
    /// @param s  String to validate (not modified).
    /// @return   true when `s` is valid. Returns false, after printing
    ///           "DNA invalid" to stdout, when `s` is shorter than three
    ///           characters; returns false silently when it contains any
    ///           other character.
    bool checkDNA(string &s){
        if(s.size()<3){
            cout << "DNA invalid" << endl;
            // Previously this path returned an uninitialised bool.
            return false;
        }
        // Every character must be valid. The old loop overwrote its result on
        // each iteration, so only the last character decided the outcome.
        return s.find_first_not_of("ACGT") == string::npos;
    }
    
    /// Advances `m` by one unless it has already reached DNA.size().
    ///
    /// @param m  Value to advance; updated in place.
    /// @return   The value of `m` after the call.
    size_t incrementValue(size_t &m){
        const bool belowLimit = m < DNA.size();
        if(!belowLimit){
            return m;
        }
        ++m;
        return m;
    }
    

2 个答案:

答案 0 :(得分:1)

怎么样:

// Maps every substring of `dna` that is at least 3 characters long to the
// list of 0-based offsets at which it starts.
std::map< std::string, std::vector<int> > msvi;
const std::size_t len = dna.size();
for(std::size_t from = 0; from < len; ++from) {
  // Cap the length at what is left after `from`. With an uncapped length
  // substr() silently truncates, so the same tail substring (including ones
  // shorter than 3) gets recorded several times at the same offset.
  for(std::size_t sz = 3; sz <= len - from; ++sz) {
    msvi[ dna.substr(from, sz) ].push_back(static_cast<int>(from));
  }
}

这将创建所有长度不小于3的子字符串,并把它们的起始位置保存在map中。

Live demo link

Print only the items with 2 or more instances


由于您不想使用std::map,您可以参照this page(用C语言编写)构建一个trie(字典树)结构。将树节点更改为:

/* One node of a trie with one child slot per letter. */
struct tree_node {
  /* Presumably the start offsets of the substring spelled by the path to this
     node -- confirm against the insertion code. */
  vector<int> starts;
  /* A to Z. Value-initialised so that every slot starts out as a null pointer;
     without the initialiser a default-initialised node holds indeterminate
     pointers that cannot be safely tested or followed. */
  struct tree_node *children[26] = {};
};

答案 1 :(得分:1)

基于Mohit的答案,但改用指针并复用同一块缓冲区,与string.substr相比可以获得更好的性能。

#include <iostream>
#include <cstring>
#include <vector>
#include <string>

using namespace std;

// Sample sequence analysed by main().
static const char* DNAdata = "ACTGCGACGGTACGCTTCGACGTAG";
// Number of characters in DNAdata (terminator excluded).
static const size_t len = strlen(DNAdata);

// uniqueKeys[n-1] holds the distinct substrings of length n seen so far.
vector< vector< string > > uniqueKeys(len);
// locations[n-1][i] holds the offsets recorded for uniqueKeys[n-1][i];
// saveInfo() appends to both containers in lockstep.
vector< vector< vector<size_t> > > locations(len);


// Records that the NUL-terminated substring `str`, of length `n`, occurs at
// offset `loc`. A substring not seen before gets a new key and a new offset
// list; otherwise `loc` is appended to the existing list.
void saveInfo(const char* str, size_t n, size_t loc) {
   vector<string>& keysOfLen = uniqueKeys[n-1];
   vector<vector<size_t> >& locsOfLen = locations[n-1];

   // Linear search for an entry that already holds this substring.
   size_t slot = 0;
   while (slot < keysOfLen.size() && !(keysOfLen[slot] == str)) {
      ++slot;
   }

   if (slot < keysOfLen.size()) {
      locsOfLen[slot].push_back(loc);
      return;
   }

   // First sighting: key and offset list are appended at the same index.
   vector<size_t> firstOffsets;
   firstOffsets.push_back(loc);
   keysOfLen.push_back(str);
   locsOfLen.push_back(firstOffsets);
}

// Prints `str` on its own line, followed by every offset recorded for it by
// saveInfo(), one per line. Nothing beyond `str` itself is printed when the
// substring was never recorded.
void printInfo(const char* str) {
   cout << str << endl;
   const size_t queryLen = strlen(str);
   // The tables are indexed by length - 1: an empty query would wrap around
   // and a query longer than the tables would index past their end.
   if (queryLen == 0 || queryLen > uniqueKeys.size()) {
      return;
   }
   const vector<string>& keys = uniqueKeys[queryLen-1];
   const vector<vector<size_t> >& locs = locations[queryLen-1];
   for (size_t i=0; i<keys.size(); ++i) {
      if (keys[i] == str) {
         const vector<size_t>& l = locs[i];
         for (vector<size_t>::const_iterator iter = l.begin(); iter != l.end(); ++iter) {
            cout << *iter << endl;
         }
         // Keys are unique per length, so there is nothing more to find.
         break;
      }
   }
}

// Records every substring of DNAdata of length 3..len together with its
// 0-based start offset, then prints the offsets of two sample substrings.
int main() {
   {
      // Mutable, NUL-terminated working copy. The vector owns the memory, so
      // it is released even if saveInfo() throws (e.g. std::bad_alloc).
      vector<char> buffer(DNAdata, DNAdata + len + 1);
      char* const start = &buffer[0];
      char* const end = start + len;
      for (size_t n = 3; n <= len; ++n) {
         size_t loc = 0;
         char* p = start;
         char* e = p + n;
         while (e <= end) {
            // Temporarily terminate the window [p, e) in place so it can be
            // handed over as a C string without copying, then restore.
            const char save = *e;
            *e = 0;
            saveInfo(p++, n, loc++);
            *e = save;
            ++e;
         }
      }
   }

   printInfo("GTA");
   printInfo("ACTGCGACGGTACGCTTCGACGTA");

   return 0;
}

打印全部:

// Dumps everything recorded so far: for each length n from 3 to len, a
// "--> n <--" header, then each distinct substring of that length followed
// by its recorded offsets, one value per line.
void printAll() {
   for (size_t n=3; n<=len; ++n) {
      cout << "--> " << n << " <--" << endl;
      const vector<string>& keysOfLen = uniqueKeys[n-1];
      const vector<vector<size_t> >& locsOfLen = locations[n-1];
      for (size_t k=0; k<keysOfLen.size(); ++k) {
         cout << keysOfLen[k] << endl;
         const vector<size_t>& offsets = locsOfLen[k];
         for (size_t j=0; j<offsets.size(); ++j) {
            cout << offsets[j] << endl;
         }
      }
   }
}