awk根据文件中的正则表达式匹配打印特定字段

时间:2016-04-30 15:08:58

标签: regex awk

我正在尝试使用awk在input中查找关键字,并在找到时打印指定的字段。下面的awk可以运行,但不会产生所需的输出。期望的行为是:如果在行中找到TYPE=ins或TYPE=del,则打印$1、$2、$4、$5以及LEN=字段。LEN=也是该行中的一个字段,其=后面是一个数字。谢谢:)。

输入

chr1    1647893 .   C   CTTTCTT 31.9545 PASS    AF=0.330827;AO=179;DP=695;FAO=132;FDP=399;FR=.;FRO=267;FSAF=67;FSAR=65;FSRF=124;FSRR=143;FWDB=0.0145873;FXX=0.00249994;HRUN=1;LEN=6;MLLD=190.481;OALT=TTTCTT;OID=.;OMAPALT=CTTTCTT;OPOS=1647894;OREF=-;PB=0.5;PBP=1;QD=0.320346;RBI=0.0146526;REFB=-0.0116875;REVB=0.00138131;RO=471;SAF=85;SAR=94;SRF=236;SRR=235;SSEN=0;SSEP=0;SSSB=-0.0324817;STB=0.528856;STBP=0.43;TYPE=ins;VARB=0.0222858 GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR   0/1:31:695:399:471:267:179:132:0.330827:94:85:236:235:65:67:124:143
chr1    1650787 .   T   C   483.012 PASS    AF=0.39;AO=181;DP=459;FAO=156;FDP=400;FR=.;FRO=244;FSAF=100;FSAR=56;FSRF=162;FSRR=82;FWDB=-0.00931067;FXX=0;HRUN=1;LEN=1;MLLD=210.04;OALT=C;OID=.;OMAPALT=C;OPOS=1650787;OREF=T;PB=0.5;PBP=1;QD=4.83012;RBI=0.018986;REFB=-0.0114993;REVB=-0.0165463;RO=276;SAF=116;SAR=65;SRF=184;SRR=92;SSEN=0;SSEP=0;SSSB=-0.0305478;STB=0.515311;STBP=0.652;TYPE=snp;VARB=0.019956  GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR   0/1:483:459:400:276:244:181:156:0.39:65:116:184:92:56:100:162:82
chr1    17034455    .   CGCGCGCGT   C   50  PASS    AF=0.205882;AO=56;DP=272;FR=.;LEN=8;OALT=-;OID=.;OMAPALT=C;OPOS=17034456;OREF=GCGCGCGT;RO=216;SAF=27;SAR=29;SRF=112;SRR=104;TYPE=del    GT:GQ:DP:RO:AO:SAF:SAR:SRF:SRR:AF   0/1:99:272:216:56:27:29:112:104:0.205882

AWK

# Asker's attempt: on lines matching TYPE=ins or TYPE=del it prints $1,$2,$4,$5
# followed by the literal strings "/TYPE=*/" and "/LEN=*/" concatenated with the
# whole line ($0), because quoted text in awk is a string, not a pattern; the
# trailing `1` prints every other line unchanged. Output is redirected to `out`.
awk '/TYPE=ins/ {print $1,$2,$4,$5, "/TYPE=*/" "/LEN=*/" $0;next} /TYPE=del/ {print $1,$2,$4,$5, "/TYPE=*/" "/LEN=*/" $0;next} 1' input > out

所需的输出

chr1    1647893 C   CTTTCTT TYPE=ins LEN=6
chr1    17034455 CGCGCGCGT  C TYPE=del LEN=8

2 个答案:

答案 0 :(得分:1)

这是一个awk解决方案:

# Print columns 1, 2, 4 and 5 plus the TYPE= and LEN= entries of the
# semicolon-separated INFO column ($8) for records whose TYPE starts with
# ins or del.
awk '$8 ~ /(^|;)TYPE=(ins|del)/ {
    # Split only the INFO column: splitting the whole line would leave the
    # first and last entries glued to the neighbouring columns.
    n = split($8, info, ";")
    len = ""
    type = ""
    for (i = 1; i <= n; i++) {
        # Anchor at the start so keys merely ending in LEN/TYPE do not match.
        if (info[i] ~ /^LEN=/)  { len = info[i] }
        if (info[i] ~ /^TYPE=/) { type = info[i] }
    }
    print $1, $2, $4, $5, type, len
}' input

输出:

chr1 1647893 C CTTTCTT TYPE=ins LEN=6
chr1 17034455 CGCGCGCGT C TYPE=del LEN=8

答案 1 :(得分:1)

您可以使用此awk命令:

# Print columns 1, 2, 4 and 5 plus the TYPE= and LEN= entries for each
# line containing TYPE=ins or TYPE=del.
awk '
# Return the first "<key>=<value>" substring of the current line, where the
# value runs up to the next semicolon, space or tab.
function grab(key,    start) {
   start = match($0, key "=[^; \t]+")
   return substr($0, start, RLENGTH)
}
$0 ~ /TYPE=(ins|del)/ {
   kind = grab("TYPE")
   size = grab("LEN")
   print $1, $2, $4, $5, kind, size
}' file

输出:

chr1 1647893 C CTTTCTT TYPE=ins LEN=6
chr1 17034455 CGCGCGCGT C TYPE=del LEN=8