使用位置文件和目标文件在特定位置提取序列

时间:2014-06-07 16:54:26

标签: awk extract sequence fasta

我有一个DNA序列-file1(250M字符/字节),看起来像这样(FASTA格式):

$sequence-file1
TCCTCCAAATGATGTCAGTGTCCTCCATATGATGTCAATGTCCTCCATAT
GATGTCAATATCCTCCGTATGATGTCAATATCCTCCGTATGATGTCAATA
TCCTCCATATGATGTCAGTGTCCTCTGTATGACATCAATATCCTCCATAC
GATGCCCCTGTCCTTCATATGATGTCAGTGTCCTTTTGTGAGCACCAGTG
TCCTTTGTATGACATCAGTAGTCTCCCATGAATGTCACTGTCTTCCCATA

以及一个位置文件position-file2,其中每行一个位置,且位置不连续:

$positions-file2
1
2
7
39
51

我需要按positions-file2中指定的位置从sequence-file1中提取字符,并以“位置 字符”的形式打印出来。我使用了以下awk程序:

$prog.file.awk    
    # Runs for every input record; x is the 1-based character position to
    # extract and is supplied on the command line with `awk -v x=N`.
    {
        # i+(NR-1)*length is the position of character i counted across all
        # records read so far, assuming each earlier record has the same
        # length as the current one.
        # NOTE(review): only `print` belongs to the `if`; the `exit` after the
        # `;` is a separate statement that runs once the `for` loop has
        # finished, so awk stops after the first record and positions beyond
        # the first line are never reached.
        for (i=1;i<=length;i++) 
            if((i+(NR-1)*length)==x) 
                print x"\t"substr($0,i,1);exit 
    }

...当我通过xargs传入位置“x”时,它只对前50个字符(即第一行)有效:    xargs -I{i} awk -v x={i} -f prog.file.awk sequence-file1 < positions-file2 输出:

1   T
2   C
7   A
39  T

position-file2中任何高于50的数字都将被忽略。给定上述输入文件的我想要的输出是:

1   T
2   C
7   A
39  T
51  G

此外,我正在寻找一种经济的解决方案,因为对于250M字符文件,我有大约200M的位置可以匹配。

2 个答案:

答案 0 :(得分:1)

更正数据后,以下内容将有效:

awk 'FNR==NR{p[$1]++;next} {for(x in p)print x,substr($0,x,1)}' pf2 sf1

目前每行只有50个字符,因此您无法打印第51个字符。它也不会搜索该行中的每个字符,它只是提取您指定的字符,因此它会更快。

**解释**

FNR==NR表示后面的花括号中的所有内容仅适用于文件pf2的处理。在那里,我将位置保存在数组p[]中,因此在读取位置文件后p [1] = 1,p [2] = 1,p [7] = 1,p [39] = 1和p [51] = 1。

第二组花括号中的代码仅适用于第二个文件sf1。它遍历我们在p[]中保存的所有位置,并使用substr()提取当前记录中的所选字符。

答案 1 :(得分:1)

我知道标签写的是awk,但考虑到数据集的预期大小,awk感觉像是错误的工具。我的C程序比预期的要长一些,部分原因是我添加了用于验证行终止符和行长度的代码。

[dennis@localhost dna]$ gcc -Wall reindex.c 
[dennis@localhost dna]$ ./a.out sequence.dat position.dat
1   T
2   C
7   A
39  T
51  G

将sequence-file1示例文本复制到sequence.dat、将position-file2复制到position.dat之后,程序似乎可以正常工作。

#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>


/** Print the command-line synopsis to stderr. */
void usage(int argc,char **argv);
/** Determine the text length and the total length of the lines of a
 *  sequence file; returns non-zero when this cannot be established. */
int analyze(
  FILE *fp 
  ,long *pLineTextLen   /**< OUT: Length of alpha text per line      */
  ,long *pLineBinLen    /**< OUT: Total length of line including lf  */
  );

/** Print the character found at each one-based position listed in posFp
 *  to stdout; returns non-zero on error. */
int reindex(
  FILE *seqFp           /**< IN: file with sequence to reindex       */
  ,FILE *posFp          /**< IN: file with indexes to extract        */
  ,long lineTextLen     /**< IN: text to index per line              */
  ,long lineBinLen      /**< IN: characters including termination    */
  );


/*********************************************************************/
/** Program entry point.
 *
 * Expects two command-line arguments: the sequence file followed by
 * the position file.  The sequence file is analyzed for its line
 * layout and then the character at every listed position is printed.
 *
 * The sequence file is opened in binary mode because its characters
 * are fetched with computed byte offsets; seeking to a computed offset
 * is only well-defined on binary streams, and the line analysis counts
 * '\r' and '\n' as individual bytes.
 *
 * return zero on success, non-zero on any error (errno value of a
 * failed open, otherwise a negative source line number).
 *********************************************************************/
int main( int argc, char **argv)
{
  int         errval=0;
  FILE       *seqFp=NULL;
  FILE       *posFp=NULL;
  long        lineTextLen=0;
  long        lineBinLen=0;
  const char *seqPath=NULL;
  const char *posPath=NULL;

  if(argc < 2)
  {
    usage(argc,argv);
    errval=-__LINE__;
    goto exiterror;
  }
  seqPath = argv[1];
  seqFp = fopen(seqPath,"rb");
  if(seqFp == NULL)
  {
    /* save errno before fprintf can disturb it; fopen is not required
       to set it, so never report success after a failed open */
    errval=errno;
    if(errval == 0)
    {
      errval=-__LINE__;
    }
    fprintf(stderr,"Unable to open %s\n",seqPath);
    goto exiterror;
  }

  if(argc < 3)
  {
    usage(argc,argv);
    errval=-__LINE__;
    goto exiterror;
  }
  posPath = argv[2];
  posFp = fopen(posPath,"r");
  if(posFp == NULL)
  {
    errval=errno;
    if(errval == 0)
    {
      errval=-__LINE__;
    }
    fprintf(stderr,"Unable to open %s\n",posPath);
    goto exiterror;
  }

  errval = analyze(seqFp,&lineTextLen,&lineBinLen);
  if(errval)
  {
    fprintf(stderr,"Unable to estimate line length of %s\n"
            ,seqPath);
    errval=-__LINE__;
    goto exiterror;
  }

  errval = reindex(seqFp,posFp,lineTextLen,lineBinLen);
  if(errval)
  {
    fprintf(stderr,"Unable to reindex (errval=%i)\n"
            ,errval);
  }

exiterror:
  /* single cleanup point: close whatever was opened */
  if(seqFp != NULL)
  {
    fclose(seqFp);
    seqFp=NULL;
  }
  if(posFp != NULL)
  {
    fclose(posFp);
    posFp=NULL;
  }
  return(errval);

}


/*********************************************************************/
/** Print the command-line synopsis to stderr.
 *
 * The program name is taken from argv[0]; when that is unavailable
 * (argc of zero or a NULL entry) the fixed name "reindex" is printed
 * instead of passing a NULL pointer to fprintf.
 *********************************************************************/
void usage(int argc,char **argv)
{
  const char *progName = "reindex";

  if( (argc > 0) && (argv != NULL) && (argv[0] != NULL) )
  {
    progName = argv[0];
  }

  fprintf(stderr,"%s {sequence-file} {position-file}\n"
          ,progName);
}

/*********************************************************************/
/** Analyze file to determine line length
 *
 * Reads the start of the file and measures two quantities:
 *  - the number of alphabetic characters per line (text length), and
 *  - the number of bytes from the start of one line to the start of
 *    the next, i.e. text plus terminators (binary length).
 * '\r', '\n', space and tab all count as line termination.
 *
 * Reading stops as soon as both lengths have been confirmed twice
 * (four confirmations in total).  If the file ends before that, the
 * result is still accepted provided nothing inconsistent was seen;
 * the final line may then be shorter than the others and may lack a
 * terminator.  For a single unterminated line both lengths equal the
 * line's length.
 *
 * The stream is always rewound before returning, and the lengths
 * measured so far are stored through the non-NULL output pointers
 * even on failure.
 *
 * return non-zero if lines not consistent or other error: empty
 * file, leading whitespace, a byte that is neither alphabetic nor
 * whitespace, a line longer or (other than the last) shorter than
 * the first, or a read error.
 *********************************************************************/
int analyze(
  FILE *fp 
  ,long *pLineTextLen   /**< OUT: Length of alpha text per line      */
  ,long *pLineBinLen    /**< OUT: Total length of line including lf  */
  )
{
  enum { CONFIRMATIONS_NEEDED = 4 };  /* 2 text and 2 bin */
  enum
  {
    TEXT_READ=0,
    TERM_READ=1
  }
  state= TEXT_READ;
  long lineTextLen=0;
  long lineBinLen=0;
  long count=0;           /* bytes seen since the start of this line */
  int  confirmCount=0;
  int  shortLineSeen=0;   /* a short line is only legal as last line */
  int  failed=0;
  int  input=EOF;

  while( !failed
         && (confirmCount < CONFIRMATIONS_NEEDED)
         && ((input=fgetc(fp)) != EOF) )
  {
    if(isalpha(input))
    {
      if(state == TERM_READ)
      {
        /* first character of a new line: the previous line, including
           its terminators, is complete */
        state = TEXT_READ;
        if(shortLineSeen)
        {
          failed=1;       /* the short line was not the last one */
        }
        else if(lineBinLen == 0)
        {
          lineBinLen=count;
        }
        else if(count != lineBinLen)
        {
          failed=1;       /* mismatch */
        }
        else
        {
          confirmCount++;
        }
        count=0;          /* start new line */
      }
      count++;
    }
    else if( (input == '\r')
             || (input == '\n')
             || isblank(input) )
    {
      if(state == TEXT_READ)
      {
        /* first terminator after text: the text part is complete */
        state = TERM_READ;
        if(count == 0)
        {
          /* whitespace before any text would shift every offset */
          failed=1;
        }
        else if(lineTextLen == 0)
        {
          lineTextLen=count;
        }
        else if(count == lineTextLen)
        {
          confirmCount++;
        }
        else if(count < lineTextLen)
        {
          shortLineSeen=1; /* acceptable only if no more text follows */
        }
        else
        {
          failed=1;       /* mismatch */
        }
      }
      count++;
    }
    else
    {
      /* a byte that is neither text nor termination cannot be placed
         in the line layout; skipping it would yield wrong offsets */
      failed=1;
    }
  }

  if(ferror(fp))
  {
    failed=1;
  }

  if( !failed && (confirmCount < CONFIRMATIONS_NEEDED) )
  {
    /* end of file reached before full confirmation */
    if(lineTextLen == 0)
    {
      if( (state == TEXT_READ) && (count > 0) )
      {
        /* one single line without terminator */
        lineTextLen=count;
        lineBinLen=count;
      }
      else
      {
        failed=1;         /* no text at all */
      }
    }
    else if(lineBinLen == 0)
    {
      /* one single terminated line */
      lineBinLen=count;
    }
    else if( (state == TEXT_READ) && (count > lineTextLen) )
    {
      failed=1;           /* unterminated last line is too long */
    }
  }

  rewind(fp);
  if( pLineTextLen )
  {
    *pLineTextLen = lineTextLen;
  }
  if( pLineBinLen )
  {
    *pLineBinLen = lineBinLen;
  }

  return(failed);  /* non-zero if not confirmed */
}

/**********************************************************************/
/** reindex sequence file to std out.
 *
 * Print char at specified character indexes in sequence file.
 * Character indexes are one-based index of characters in
 * seq file not including line terminations.  Line length and
 * termination are assumed to be consistent and specified by
 * passed parameters.
 *
 * Indexes are read as text strings one per line from pos file and
 * converted with strtol using base 0.  Reading stops at end of file
 * or at the first line that does not start with an alphanumeric
 * character (for example an empty line).
 *
 * Each hit is written to stdout as "<index>\t<char>\n".
 *
 * \return non-zero on error (negative source line number): invalid
 * line lengths, an index that is zero, unparsable or out of range,
 * a failed seek, or an index that falls past the end of the file or
 * onto a line terminator (past the end of a short last line).
 *********************************************************************/

int reindex(
  FILE *seqFp           /**< IN: file with sequence to reindex       */
  ,FILE *posFp          /**< IN: file with indexes to extract        */
  ,long lineTextLen     /**< IN: text to index per line              */
  ,long lineBinLen      /**< IN: characters including termination    */
  )
{
  int  errval=0;
  char buffer[80];

  /* lineTextLen is used as a divisor below, and a line can never
     occupy fewer bytes than it has text characters */
  if( (lineTextLen <= 0) || (lineBinLen < lineTextLen) )
  {
    errval=-__LINE__;
    goto exiterror;
  }

  while(fgets(buffer,sizeof(buffer),posFp) != NULL)
  {
    long index;
    long lines;
    long column;
    long seekPos;
    int  sequence;

    /* <ctype.h> functions need an unsigned char value */
    if(!isalnum((unsigned char)buffer[0]))
    {
      break;  /* empty line ends the list */
    }

    errno=0;
    index=strtol(buffer,NULL,0);
    if( (index <= 0) || (errno == ERANGE) )
    {
      errval=-__LINE__;
      goto exiterror;
    }
    index--;  /* switch to zero based index */

    /* integer truncated division expected below */
    lines=index/lineTextLen;
    column=index%lineTextLen;
    if(lines > (LONG_MAX - column)/lineBinLen)
    {
      /* byte offset would not fit in a long */
      errval=-__LINE__;
      goto exiterror;
    }
    seekPos=(lines*lineBinLen)+column;

    if(fseek(seqFp,seekPos,SEEK_SET) != 0)
    {
      errval=-__LINE__;
      goto exiterror;
    }
    sequence=fgetc(seqFp);
    if( (sequence == EOF)
        || (sequence == '\r')
        || (sequence == '\n') )
    {
      /* past the end of the data, or past the end of a short line */
      errval=-__LINE__;
      goto exiterror;
    }
    fprintf(stdout,"%li\t%c\n"
            ,index+1 /* convert back to one based */
            ,sequence);
  }

exiterror:
  return(errval);

}