Question

我有一个包含DNA序列及其ID的文件，我试图将偶数行（ID）保存到一个数组，将奇数行（序列）保存到另一个数组。然后我想将所有序列相互比较，找出唯一的序列。例如，如果Seq A是AGTCGAT而Seq B是TCG，那么Seq B不是唯一的（它包含在Seq A中）。我想将唯一序列及其ID保存到输出文件中；如果某个ID对应的序列不是唯一的，则只将该ID保存到输出文件，并在控制台打印 "Deleting sequence with id:"。我已经完成了大部分工作，但遇到了一些问题。我尝试打印出两个独立的数组sequences[]和headers[]，但由于某种原因，它们只包含5个字符串中的两个（该文件有5个ID和5个序列）。而且信息也没有按照预期的方式打印到屏幕上。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Question's program: reads inabc.fasta line by line into headers[] and
 * sequences[], then tests each sequence against every other one with strstr
 * and writes to output.fasta. The closing brace of main is not part of this
 * excerpt. */
int main(){

  int total_seq = 20000;
  char seq[900];            /* line buffer filled by fgets */
  char** headers;
  char** sequences;
  int sequence_size = 0;

  /* Two arrays of total_seq pointers; the malloc results are not checked. */
  headers = malloc(total_seq * sizeof(char*));
  sequences = malloc(total_seq * sizeof(char*));

  /* Every slot gets a 900-byte buffer up front, whether or not it is used. */
  int index;
  for(index = 0; index < total_seq; index++){
    headers[index] = malloc(900 * sizeof(char));
    sequences[index] = malloc(900 * sizeof(char));
  }

  FILE *dna_file;
  FILE *new_file;
  dna_file = fopen("inabc.fasta", "r");
  new_file = fopen("output.fasta", "w");

  /* Only the input stream is tested; new_file is used below unchecked. */
  if (dna_file == NULL){
    printf("Error");
    return 0;
  }

  /* i counts lines read: even lines go to headers[i/2], odd lines to
   * sequences[i/2]. fgets keeps the trailing '\n', and strcpy copies it. */
  int i = 0;
  int j = 0;
  while(fgets(seq, sizeof seq, dna_file)){
    if(i%2 == 0){
      strcpy(headers[i/2], seq);
      i++;
    }
    else{
      strcpy(sequences[i/2], seq);
      i++;
    }
  }


  fclose(dna_file);
  sequence_size = i/2;      /* number of complete header/sequence pairs */

  /* For each pair (i, j) with i != j, look for sequences[i] inside
   * sequences[j]. The output statements are inside the inner loop, so they
   * run once per j rather than once per sequence i. */
  char* result;
  for(i=0; i < sequence_size; i++){
    for(j=0; j < sequence_size; j++){
      if(i==j){
        continue;
    }
      result = strstr(sequences[j], sequences[i]);
      if(result== NULL){
        fprintf(new_file,"%s", headers[i]);
        fprintf(new_file,"%s", sequences[i]);
      }
      else{
        printf("Deleting sequence with id: %s \n", headers[i]);
        /* NOTE(review): the sequence text is passed as the format string. */
        printf(sequences[i]);
        fprintf(new_file,"%s", headers[i]);
      }
    }
  }

示例文件inabc.fasta很短，但我使用的实际文件很长，这就是我使用malloc的原因。任何帮助将不胜感激！

编辑：示例输入文件inabc.fasta：

cat inabc.fasta

> id1 header1
abcd
> id2 header2
deghj
> id3 header3
defghijkabcd
> id4 header4
abcd
> id5 header5
xcvbnnmlll

因此，对于此示例，序列1和4将不会保存到输出文件

Answer 1

此：

/* Read loop as quoted by the answer: i is incremented on every line, yet it
 * is also used directly as the array index. */
while( fgets(seq, sizeof seq, dna_file) ) {
    if( i % 2 == 0 ){
        strcpy(headers[i], seq);        /* even i: slots 0, 2, 4, ... */
        i++;
    }
    else {
        strcpy(sequences[i-1], seq);    /* odd i: i-1 is again 0, 2, 4, ... */
        i++;
    }
}

会导致数组中每隔一个元素就被跳过一个：

当i == 0时，它会存储在headers[0]
当i == 1时，它会存储在sequences[0]
当i == 2时，它会存储在headers[2]
当i == 3时，它会存储在sequences[2]

等等。

然后你做：

sequence_size = i/2;

因此，如果您循环sequence_size次，您只会遍历到已写入数组的一半位置，而且您打印的元素中每隔一个都是未初始化的。这就是为什么您只打印出了一半的元素（如果您有5个元素，那么i / 2 == 2，您只会看到2个），也是为什么它没有按照应有的方式打印到屏幕上。

当您在输入中读取时，您最好只使用两个单独的计数器，而单独使用一个变量来存储您是否在奇数或偶数输入线上。

例如：

/* i indexes headers, j indexes sequences; even records which kind of line
 * is expected next and flips after every line read. */
int i = 0, j = 0, even = 1;
while( fgets(seq, sizeof seq, dna_file) ) {
    if( even ){
        strcpy(headers[i++], seq);      /* store, then advance the header counter */
        even = 0;
    }
    else {
        strcpy(sequences[j++], seq);    /* store, then advance the sequence counter */
        even = 1;
    }
}

这里最好使用两个计数变量，因为如果读入的行数是奇数，两个数组中实际读入的元素个数就会不同。

Answer 2

除了其他注释之外，您需要更正输出例程中的一些逻辑错误。下面，我已将您的代码留在评论中，因此您可以按照所做的更改和添加进行操作。

有几种方法可以更有效地处理内存管理，并提供一种方法来干净地迭代数据，而无需在整个代码中跟踪计数器。具体来说，当您分配pointers-to-pointer-to-char数组时，请使用calloc而不是malloc，以便将指针初始化为zero/NULL。这使您可以轻松地仅迭代已分配的那些指针。

在读取数据之前，无需预先分配20000个900字符的数组（再乘以2）。先分配指针数组（或者从较少的指针开始，比如256个，然后根据需要realloc），然后在读取循环中只为headers和sequences中实际需要的每个元素分配内存。此外，每次向headers和sequences添加元素时，不要分配1800个字符（900 * 2），而只需分配保存数据所需的内存。这会产生巨大的差异。例如，在开始读取这一小组示例数据之前，您就分配了20000 * 900 * 2 = 36000000 bytes (36M)。即使分配全部20000个指针，只要按需为此示例数据分配内存，也能将内存使用量限制在321,246 bytes（不到36M的1%）。

写循环中的逻辑行不通。您必须把写出数据的操作移到内层循环之外，否则无法判断是否应删除重复条目。此外，仅测试result并不能提供跳过重复项的方法：result在内层循环的每次迭代中都会变化。您需要在内层循环中进行测试并设置一个标志，在离开内层循环之后再根据该标志决定是否删除重复项。

最后，由于您是动态分配内存，因此您有责任跟踪已分配的内存，并在不再需要时将其释放。使用calloc分配指针数组，可以让释放已使用的内存变得非常简单。

请查看我对您的代码所做的更改和添加。理解这些更改，如果您有任何疑问，请告诉我。注意：为了不使代码混乱，省略了许多检查。在完整数据集上运行时，您至少应该确保不超过已分配的20000个指针，并根据需要realloc。您还应检查strdup是否成功（它会分配内存），不过比较headers和sequences的索引计数可以给您一定的保证。我相信还有更多值得检查的地方。祝您好运。

**修改后的代码：**

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAXSEQ 20000    /* slots per pointer array; the last one stays NULL */
#define SZSEQ 900       /* size of the line buffer handed to fgets          */

/*
 * Return a heap-allocated, NUL-terminated copy of the first len bytes of src,
 * or NULL if malloc fails. The caller owns the result and must free() it.
 * Portable stand-in for POSIX strdup(), which strict ISO C11 does not declare.
 */
static char *copy_line (const char *src, size_t len)
{
    char *dst = malloc (len + 1);

    if (dst) {
        memcpy (dst, src, len);
        dst[len] = 0;
    }
    return dst;
}

/*
 * Free every string up to the first NULL slot, then the pointer array itself.
 * Relies on the array having been calloc'ed and never completely filled, so a
 * NULL terminator is always present. A NULL array is accepted.
 */
static void free_lines (char **lines)
{
    if (!lines)
        return;
    for (size_t n = 0; lines[n]; n++)
        free (lines[n]);
    free (lines);
}

/*
 * Read "inabc.fasta", storing lines that start with '>' in headers[] and all
 * other non-empty lines in sequences[]. A sequence is "deleted" when it occurs
 * as a substring of any other sequence (identical sequences delete each
 * other). For every record the header is written to "output.fasta"; the
 * sequence line follows only when the sequence was kept. Deleted records are
 * also reported on stdout.
 *
 * Returns 0 on success, 1 on any allocation, file or capacity error.
 */
int main ()
{
    int total_seq = MAXSEQ;
    char seq[SZSEQ] = {0};
    char **headers = NULL;
    char **sequences = NULL;
    FILE *dna_file = NULL;
    FILE *new_file = NULL;
    size_t len = 0;
    int hidx = 0;               /* number of headers stored   */
    int sidx = 0;               /* number of sequences stored */
    int nseq = 0;               /* complete header/sequence pairs */
    int i = 0;
    int j = 0;
    int del = 0;
    int rc = 1;                 /* assume failure until the end is reached */

    /* calloc initializes to 0, so unused slots are NULL and act as the
     * terminator free_lines() depends on */
    headers = calloc (total_seq, sizeof (*headers));
    sequences = calloc (total_seq, sizeof (*sequences));
    if (!headers || !sequences) {
        fprintf (stderr, "Error: memory allocation failed.\n");
        goto cleanup;
    }

    /* original (from the question): allocate as needed instead - see read loop */
//     for (index = 0; index < total_seq; index++) {
//         headers[index] = malloc (900 * sizeof (char));
//         sequences[index] = malloc (900 * sizeof (char));
//     }

    /* open the input first so a missing input does not truncate the output;
     * "w" already creates the file, and it is never read back */
    dna_file = fopen ("inabc.fasta", "r");
    if (dna_file)
        new_file = fopen ("output.fasta", "w");

    if (!dna_file || !new_file) {
        fprintf (stderr, "Error: file open failed.\n");
        goto cleanup;
    }

    /* read dna_file & separate headers from sequences.
     * NOTE: a line longer than SZSEQ-1 chars is delivered by fgets in several
     * pieces; that case is not handled here. */
    while (fgets (seq, sizeof (seq), dna_file))
    {
        /* strip the line ending (LF or CRLF); len > 0 guards seq[len-1] */
        len = strlen (seq);
        while (len > 0 && (seq[len-1] == '\n' || seq[len-1] == '\r'))
            seq[--len] = 0;

        /* a blank line is neither header nor sequence, and an empty string
         * would match inside every other sequence */
        if (len == 0)
            continue;

        /* a header line has '>' as first char -- use it!
         * (without '>' markers, alternate on a line counter instead) */
        int is_header = (*seq == '>');
        char **lines = is_header ? headers : sequences;
        int *idx = is_header ? &hidx : &sidx;

        /* keep the final slot NULL so the sentinel is never overwritten */
        if (*idx >= total_seq - 1) {
            fprintf (stderr, "Error: more than %d %s lines, increase MAXSEQ.\n",
                     total_seq - 1, is_header ? "header" : "sequence");
            goto cleanup;
        }

        lines[*idx] = copy_line (seq, len);     /* allocates exactly len+1 */
        if (!lines[*idx]) {
            fprintf (stderr, "Error: memory allocation failed.\n");
            goto cleanup;
        }
        (*idx)++;
    }

    if (ferror (dna_file)) {
        fprintf (stderr, "Error: reading input failed.\n");
        goto cleanup;
    }

    if (hidx != sidx)
        fprintf (stderr, "warning: hidx:sidx (%d:%d) differ.\n", hidx, sidx);

    /* only index pairs that have both a header and a sequence, so neither
     * array is read at a NULL slot when the counts differ */
    nseq = (hidx < sidx) ? hidx : sidx;

    /* original (from the question): writes inside the inner loop, so there is
     * no way to decide once per sequence whether to delete it */
//     for (i = 0; i < sequence_size; i++) {
//         for (j = 0; j < sequence_size; j++) {
//             if (i == j) {
//                 continue;
//             }
//             result = strstr (sequences[j], sequences[i]);
//             if (result == NULL) {
//                 fprintf (new_file, "%s", headers[i]);
//                 fprintf (new_file, "%s", sequences[i]);
//             } else {
//                 printf ("Deleting sequence with id: %s \n", headers[i]);
//                 printf (sequences[i]);
//                 fprintf (new_file, "%s", headers[i]);
//             }
//         }
//     }

    for (i = 0; i < nseq; i++)
    {
        /* set the delete flag if sequence i occurs inside any other one */
        del = 0;
        for (j = 0; j < nseq; j++)
        {
            if (i != j && strstr (sequences[j], sequences[i]))
            {
                del = 1;
                break;
            }
        }

        /* write once per sequence, after the inner loop has decided */
        if (del) {
            printf ("Deleting id: '%s' with seq: '%s' \n", headers[i], sequences[i]);
            fprintf (new_file, "%s\n", headers[i]);
        } else {
            fprintf (new_file, "%s\n", headers[i]);
            fprintf (new_file, "%s\n", sequences[i]);
        }
    }

    rc = 0;

cleanup:
    /* single exit path: close whatever was opened, free whatever was allocated */
    if (dna_file)
        fclose (dna_file);

    /* fclose flushes buffered output, so a failure here means lost data */
    if (new_file && fclose (new_file) != 0) {
        fprintf (stderr, "Error: writing output file failed.\n");
        rc = 1;
    }

    free_lines (headers);
    free_lines (sequences);

    return rc;
}

**输出：**

$ ./bin/dnaio
Deleting id: '> id1 header1' with seq: 'abcd'
Deleting id: '> id4 header4' with seq: 'abcd'

**output.fasta：**

$ cat output.fasta
> id1 header1
> id2 header2
deghj
> id3 header3
defghijkabcd
> id4 header4
> id5 header5
xcvbnnmlll

C - 将一个文件保存到两个单独的阵列并打印每个元素

2 个答案: