搜索数组以查找字符序列的C程序

时间:2015-10-29 03:16:21

标签: c arrays function for-loop do-while

我只是C编程的初学者。请帮我解决以下问题。

问题:搜索包含一系列字符的给定数组的程序。这些字符限制为字母A,G,T或C.序列中的最后一个字符设置为代码0,以便轻松检测到结束。

在这里找不到我做错了什么,但一直都是错误。

/*A program that searches through a given array that contains a sequence of characters. These characters are restricted 
to be the letters A, G, T, or C. The last character in the sequence is set to be the code 0, so that the end is easily
detected. That array should be declared and initialized.*/

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
void input_sequence(int length,char input[]);
void search(char C[],char DNA[],int length);

int main(void) {
    //Given array
    char DNA[] = {'A', 'G', 'C', 'G', 'G', 'G', 'A', 'C', 'C', 'G', 'T', 'C', 
          'C', 'C', 'G', 'A', 'C', 'A', 'T', 'T', 'G', 'A', 'T', 'G', 
          'A', 'A', 'G', 'G', 'G', 'T', 'C', 'A', 'T', 'A', 'G', 'A', 
          'C', 'C', 'C', 'A', 'A', 'T', 'A', 'C', 'G', 'C', 'C', 'A', 
          'C', 'C', 'A', 'C', 'C', 'C', 'C', 'A', 'A', 'G', 'T', 'T', 
          'T', 'T', 'C', 'C', 'T', 'G', 'T', 'G', 'T', 'C', 'T', 'T', 
          'C', 'C', 'A', 'T', 'T', 'G', 'A', 'G', 'T', 'A', 'G', 'A', 
          'T', 'T', 'G', 'A', 'C', 'A', 'C', 'T', 'C', 'C', 'C', 'A', 
          'G', 'A', 'T', 'G', '\0'};
    int length,i=0,k;
    /*Program should repeatedly ask the user for two things: the length of a search sequence,
    and the search sequence itself*/
    /*The program should terminate when the length of the input sequence is zero or less*/
    do{
        printf("Enter length of DNA sequence to match: ");
        scanf("%d",&length);
        Search sequence array
        char input[length];
        //input sequence length has to be >0
        if(length>0){
            input_sequence(length,input[]);
            /*The elements of the search sequence may take on one of five characters: A,G,T,C and *. The
            meaning of the ‘*’ character is that it matches all four nucleotides: A,G,T and C.*/
            for(i=0; i<length; i++){
                k=0;
                if(input[i]!='A'&&input[i]!='G'&&input[i]!='T'&&input[i]!='C'&&input[i]!='*'){
                    printf("Erroneous character input ’%c’ exiting\n",input[i]);
                    k=1;
                }
                if(k==1)
                    break;             
            }
            if(k==0){
                search(input,DNA,length);
            }
            k=0;
        }
    }
    while(length>0);
    printf("Goodbye");

    return (EXIT_SUCCESS);
}

//Function to search for input sequence in the given array
void search(char C[],char DNA[],int length){
    int numFound = 0,i,foundIndex;
    bool found = false;
    for(i=0;i<length && !found;i++) {
        int n=0;
        char temp=C[i];
        if (temp==DNA[i]) {
            numFound++;
            if (numFound == length) {
                found = true;
                foundIndex = i - (length-1);
            }
        }
        else numFound = 0;
    }
    if (found)
        printf("Match of search sequence found at element %d\n",foundIndex);   
}

void input_sequence(int length,char input[]){
    int i;
    printf("Enter %d characters (one of AGTC*) as a search sequence: ",length);
    for(i=0; i<length; i++){
        scanf(" %c", &input[i]);
        }
}

3 个答案:

答案 0 :(得分:3)

此处示例使用GNU C library regexp

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <regex.h>

void search(const char *regexp_str, const char *DNA, int length)
{
    int reti;
    const char *p = DNA;
    const int n_matches = 5;
    regmatch_t m[n_matches];
    regex_t regex;
    (void)length;

    reti = regcomp(&regex, regexp_str, 0);
    if(reti) {
        printf("Could not compile regex: %s\n", regexp_str);
        return;
    }

    while(1) {//based on http://www.lemoda.net/c/unix-regex/
        int nomatch = regexec(&regex, p, n_matches, m, 0);
        if(nomatch) {
            printf ("No more matches.\n");
            return;
        }
        if(m[0].rm_so != -1) {
            int start = m[0].rm_so + (p - DNA);
            int finish = m[0].rm_eo + (p - DNA);
            printf("'%.*s' (bytes %d:%d)\n",
                    m[0].rm_eo - m[0].rm_so, m[0].rm_so + p,
                    start, finish);
        }
        p += m[0].rm_eo;
    }
    regfree(&regex);
}

int main(void) {
    const char *DNA = "AGCGGGACCGTCCCGACATTGATGAAGGGTCATAGACCCA"
                      "ATACGCCACCACCCCAAGTTTTCCTGTGTCTTCCATTGAG"
                      "TAGATTGACACTCCCAGATG";
    while(1) {
        int length;
        char input[256];

        printf("Enter length of DNA sequence to match: ");
        fgets(input, sizeof(input), stdin);
        length = strtol(input, NULL, 10);
        if(length <= 0) {//input sequence length has to be >0
            break;
        } else if(length >= (int)(sizeof(input) - 1)) {
            printf("ERROR: Too big length=%d, max supported length=%d\n",
                   length, sizeof(input) - 1);
            break;
        }

        while(1) {
            const char *validInputs = "AGTC*";
            printf("Enter %d characters (one of AGTC*) as a search sequence: ",length);
            fgets(input, sizeof(input), stdin);

            int valid = 1;
            for(int i = 0; i < length; i++) {
                if(strchr(validInputs, input[i]) == NULL) {
                  printf("Erroneous character input '%c' in '%s'\n", input[i], input);
                  valid = 0;
                  break;
                }
            }
            if(valid) {
                break;
            }
        }
        input[length] = 0;
        //substitute '*' on '.' for using in regexp
        char *ptr = input;
        while((ptr = strchr(ptr, '*')) != NULL) {
            *ptr = '.';
        };
        printf("search for: %s\n", input);
        search(input, DNA, length);
    }
    printf("Goodbye\n");
    return (EXIT_SUCCESS);
}

另外使用C ++ 11 std::regex(仅需要更改search()):

#include <regex>
#include <iterator>

void search(const char *C, const char *DNA, int )
{
    std::regex regex(C);
    std::string str(DNA);
    auto words_begin = std::sregex_iterator(str.begin(), str.end(), regex);
    auto words_end = std::sregex_iterator();
    printf("Found %d matches:\n", std::distance(words_begin, words_end));
    for(std::sregex_iterator i = words_begin; i != words_end; ++i) {
        std::smatch match = *i;
        printf(" match: %s, pos=%d\n", match.str().c_str(), match.position());
    }
}

答案 1 :(得分:1)

在您的主要功能中,此行是一个问题:

search(input[],DNA[],length);

参数1和2,输入[]和DNA []不正确。该表示法用于声明和初始化数组。调用这些数组时,除非需要该数组中的特定元素,否则应将括号括起来。 尝试将其重写为:

search(input, DNA, length);

此外,你在do while循环结束时缺少一个结束大括号。

答案 2 :(得分:-1)

基本思路是扫描数组,比较字符,直到找到所有匹配项。要实现你可以有两个指针,一个最初指向DNA数组,另一个指向你的目标数组。然后你比较两个字符,如果它们匹配则将指针向前移动一步。如果匹配失败,请将目标数组指针重置为第一个char并将DNA指针向前移动一步。重启这些程序直到完全匹配。 您可以查看一个非常有效的算法Boyer–Moore string search algorithm

如果您不想自己实现算法,则有一个简单的内置函数strstr()。您将两个数组传递给它,它将返回第一个出现位置。