我有一个DNA序列-file1(250M字符/字节),看起来像这样(FASTA格式):
$sequence-file1
TCCTCCAAATGATGTCAGTGTCCTCCATATGATGTCAATGTCCTCCATAT
GATGTCAATATCCTCCGTATGATGTCAATATCCTCCGTATGATGTCAATA
TCCTCCATATGATGTCAGTGTCCTCTGTATGACATCAATATCCTCCATAC
GATGCCCCTGTCCTTCATATGATGTCAGTGTCCTTTTGTGAGCACCAGTG
TCCTTTGTATGACATCAGTAGTCTCCCATGAATGTCACTGTCTTCCCATA
以及一个位置文件position-file2,其中列出了一些不连续的位置:
$positions-file2
1
2
7
39
51
我需要按position-file2中指定的位置从sequence-file1中提取字符,并以“position character”(位置 字符)的形式打印出来。以下是我的awk程序:
$prog.file.awk
# Intended to print the character found at 1-based position x (supplied with
# "-v x=N"), treating the input lines as one continuous sequence.
{
# i walks the columns of the current line; i+(NR-1)*length turns a column into
# an overall position, assuming every earlier line is as long as this one.
for (i=1;i<=length;i++)
if((i+(NR-1)*length)==x)
print x"\t"substr($0,i,1);exit
# NOTE: "exit" is a separate statement that follows the for loop; it is not
# part of the if. awk therefore stops after the first record, so positions
# beyond the first line are never reached.
}
……当我通过xargs
传入位置“x”时,它只对前50个字符(即序列的第一行)有效:
xargs -I{i} awk -v x={i} -f prog.file.awk sequence-file1 < positions-file2
输出:
1 T
2 C
7 A
39 T
position-file2中任何大于50的位置都会被忽略。对于上述输入文件,我想要的输出是:
1 T
2 C
7 A
39 T
51 G
此外,我希望找到一种高效的解决方案,因为对于这个250M字符的文件,我有大约200M个位置需要匹配。
答案 0 :(得分:1)
更正数据后,以下内容将有效:
# While reading the first file (pf2, where FNR==NR) each position is stored as
# a key of p[]. For every record of the second file (sf1) the command prints
# each stored position followed by the character at that column of the record.
# The iteration order of "for (x in p)" is unspecified in awk, so the output
# lines are not guaranteed to come out in position order.
awk 'FNR==NR{p[$1]++;next} {for(x in p)print x,substr($0,x,1)}' pf2 sf1
目前每行只有50个字符,因此您无法打印第51个字符。它也不会搜索该行中的每个字符,它只是提取您指定的字符,因此它会更快。
<strong>解释</strong>
FNR==NR
表示后面的花括号中的所有内容仅适用于文件pf2
的处理。在那里,我将位置保存在数组p[]
中,因此在读取位置文件后p [1] = 1,p [2] = 1,p [7] = 1,p [39] = 1和p [51] = 1。
第二组花括号中的代码仅适用于第二个文件sf1
。它遍历我们在p[]
中保存的所有位置,并使用substr()
提取当前记录中的所选字符。
答案 1 :(得分:1)
我知道标签写的是awk,但考虑到数据集的预期大小,awk感觉并不是合适的工具。我的C代码比预期的要长一些,部分原因是我添加了用于验证行终止符和行长度的代码。
[dennis@localhost dna]$ gcc -Wall reindex.c
[dennis@localhost dna]$ ./a.out sequence.dat position.dat
1 T
2 C
7 A
39 T
51 G
将sequence-file1示例文本复制到sequence.dat、将position-file2复制到position.dat之后,程序看起来可以正常工作。
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <errno.h>
/* Forward declarations; the definitions follow main(). */
void usage(int argc,char **argv);
int analyze(
FILE *fp
,long *pLineTextLen /**< OUT: Length of alpha text per line */
,long *pLineBinLen /**< OUT: Total length of line including lf */
);
int reindex(
FILE *seqFp /**< IN: file with sequence to reindex */
,FILE *posFp /**< IN: file with indexes to extract */
,long lineTextLen /**< IN: text to index per line */
,long lineBinLen /**< IN: characters including termination */
);
/*********************************************************************/
/** Program entry point.
 *
 * Expects two arguments: the sequence file and the position file.
 * The line layout of the sequence file is measured with analyze(),
 * then reindex() prints the character found at every requested
 * position.
 *
 * return 0 on success. On failure: the errno value of a failed
 * fopen(), the non-zero result of reindex(), or a negative source
 * line number identifying the failing check.
 *********************************************************************/
int main( int argc, char **argv)
{
    int errval = 0;
    FILE *seqFp = NULL;
    FILE *posFp = NULL;
    long lineTextLen = 0;
    long lineBinLen = 0;
    const char *sequenceName = NULL;
    int argIdx = 1;

    if (argIdx >= argc)
    {
        usage(argc, argv);
        errval = -__LINE__;
        goto exiterror;
    }
    sequenceName = argv[argIdx];
    /* Binary mode: reindex() seeks to computed byte offsets, which ISO C
     * only defines for binary streams. In text mode a platform that
     * translates line endings would land on the wrong byte. */
    errno = 0;
    seqFp = fopen(sequenceName, "rb");
    if (seqFp == NULL)
    {
        /* fopen() is not required to set errno; never report success here */
        errval = (errno != 0) ? errno : -__LINE__;
        fprintf(stderr,"Unable to open %s\n", sequenceName);
        goto exiterror;
    }

    argIdx++;
    if (argIdx >= argc)
    {
        usage(argc, argv);
        errval = -__LINE__;
        goto exiterror;
    }
    errno = 0;
    posFp = fopen(argv[argIdx], "r");
    if (posFp == NULL)
    {
        errval = (errno != 0) ? errno : -__LINE__;
        fprintf(stderr,"Unable to open %s\n", argv[argIdx]);
        goto exiterror;
    }

    errval = analyze(seqFp, &lineTextLen, &lineBinLen);
    if (errval)
    {
        fprintf(stderr,"Unable to estimate line length of %s\n"
                ,sequenceName);
        errval = -__LINE__;
        goto exiterror;
    }

    errval = reindex(seqFp, posFp, lineTextLen, lineBinLen);
    if (errval)
    {
        fprintf(stderr,"Unable to reindex (errval=%i)\n"
                ,errval);
        goto exiterror;
    }

exiterror:
    /* single exit path: close whatever was opened */
    if (seqFp != NULL)
    {
        fclose(seqFp);
        seqFp = NULL;
    }
    if (posFp != NULL)
    {
        fclose(posFp);
        posFp = NULL;
    }
    return(errval);
}
/** Write a one-line usage summary to stderr.
 *
 * argv[0] is printed as the program name; argc is accepted for
 * symmetry with main() but is not used.
 */
void usage(int argc,char **argv)
{
    static const char usageFormat[] = "%s {seqeuence-file} {position-file}\n";
    const char *programName = argv[0];

    (void)argc; /* intentionally unused */
    fprintf(stderr, usageFormat, programName);
}
/*********************************************************************/
/** Measure the line layout of a sequence file.
 *
 * Reads characters from the start of fp. Alphabetic characters count
 * as line text; CR, LF and blank characters count as line
 * termination; any other character is skipped without being counted.
 * The first line fixes the text length (letters only) and the binary
 * length (letters plus terminators). Each later measurement that
 * agrees with those values is one confirmation, and scanning stops
 * after four of them (two text, two binary) or at the first
 * disagreement or end of file.
 *
 * The stream is rewound before returning, and the lengths measured so
 * far are stored through the non-NULL output pointers even on failure.
 *
 * return zero once four confirmations were seen, non-zero otherwise.
 *********************************************************************/
int analyze(
FILE *fp
,long *pLineTextLen /**< OUT: Length of alpha text per line */
,long *pLineBinLen /**< OUT: Total length of line including lf */
)
{
    enum { CONFIRMATIONS_NEEDED = 4 }; /* 2 text and 2 bin */
    int inTerminator = 0;   /* non-zero while consuming line-ending chars */
    int textLen = 0;        /* letters per line, 0 until first measured */
    int binLen = 0;         /* letters + terminators, 0 until first measured */
    int confirmations = 0;
    int run = 0;            /* characters counted since the line started */
    int ch;

    while (confirmations < CONFIRMATIONS_NEEDED
           && (ch = fgetc(fp)) != EOF)
    {
        if (isalpha(ch))
        {
            if (inTerminator)
            {
                /* first letter of a new line: the previous line is complete */
                inTerminator = 0;
                if (binLen == 0)
                {
                    binLen = run;
                }
                else if (run != binLen)
                {
                    break; /* mismatch */
                }
                else
                {
                    confirmations++;
                }
                run = 0; /* start new line */
            }
            run++;
        }
        else if (ch == '\r' || ch == '\n' || isblank(ch))
        {
            if (!inTerminator)
            {
                /* first terminator char: the text part of the line is complete */
                inTerminator = 1;
                if (textLen == 0)
                {
                    textLen = run;
                }
                else if (textLen != run)
                {
                    break; /* mismatch */
                }
                else
                {
                    confirmations++;
                }
            }
            run++;
        }
    }

    rewind(fp);
    if (pLineTextLen != NULL)
    {
        *pLineTextLen = textLen;
    }
    if (pLineBinLen != NULL)
    {
        *pLineBinLen = binLen;
    }
    return(confirmations < CONFIRMATIONS_NEEDED); /* non-zero if not confirmed */
}
/**********************************************************************/
/** reindex sequence file to std out.
 *
 * Print the character at each requested index of the sequence file as
 * "index<TAB>character". Indexes are one-based and count sequence
 * characters only, not line terminations. Line length and termination
 * are assumed to be consistent and are given by the passed parameters,
 * so each index is converted to a byte offset and fetched with one
 * seek; seqFp should therefore be a binary stream.
 *
 * Indexes are read as text, one per line, from the pos file. They are
 * parsed with strtol() base 0, so a leading "0" or "0x" selects octal
 * or hexadecimal. Reading stops at end of file or at the first line
 * that does not start with an alphanumeric character (e.g. an empty
 * line).
 *
 * return zero on success; a negative source line number when the line
 * length is not positive, an index is zero, unparsable or out of
 * range, or the seek or read on the sequence file fails.
 *********************************************************************/
int reindex(
FILE *seqFp /**< IN: file with sequence to reindex */
,FILE *posFp /**< IN: file with indexes to extract */
,long lineTextLen /**< IN: text to index per line */
,long lineBinLen /**< IN: characters including termination */
)
{
    int errval = 0;
    char buffer[80];

    if (lineTextLen <= 0)
    {
        /* would divide by zero (or map indexes backwards) below */
        return(-__LINE__);
    }

    while (fgets(buffer, sizeof(buffer), posFp) != NULL)
    {
        long index;
        long lines;
        long seekPos;
        int sequence;

        /* cast: passing a negative char to isalnum() is undefined */
        if (!isalnum((unsigned char)buffer[0]))
        {
            break; /* empty line ends the list */
        }

        errno = 0;
        index = strtol(buffer, NULL, 0);
        if (index <= 0 || errno == ERANGE)
        {
            errval = -__LINE__;
            break;
        }

        index--; /* switch to zero based index */
        /* integer truncated division expected below */
        lines = index / lineTextLen;
        seekPos = (lines * lineBinLen)
                  + (index - lines * lineTextLen);

        if (fseek(seqFp, seekPos, SEEK_SET) != 0)
        {
            errval = -__LINE__;
            break;
        }
        sequence = fgetc(seqFp);
        if (sequence == EOF)
        {
            errval = -__LINE__;
            break;
        }

        fprintf(stdout,"%li\t%c\n"
                ,index + 1 /* convert back to one based */
                ,sequence);
    }

    return(errval);
}