给定一个包含八列的制表符分隔文件:
22 51244237 rs575160859 C T 100 PASS AC=19;AF=0.00379393;AN=5008;NS=2504;DP=13345;EAS_AF=0;AMR_AF=0.0043;AFR_AF=0;EUR_AF=0.0099;SAS_AF=0.0061;AA=.|||;VT=SNP
如何使用 bash 根据第八列中的信息创建一个新的制表符分隔文件,其中包含 AF、EAS_AF、AMR_AF、AFR_AF、EUR_AF、SAS_AF 这几个字段及其对应的数值?
即:
#AF EAS_AF AMR_AF AFR_AF EUR_AF SAS_AF
0.00379393 0 0.0043 0 0.0099 0.0061
我知道可以先用“;”拆分第八列(https://unix.stackexchange.com/questions/156919/splitting-a-column-using-awk),然后删除不需要的列和文本字符串(例如“AF=”),但是有没有更高效的方法?
谢谢
答案 0 :(得分:0)
请尝试以下方法。
# Extract the six allele-frequency values from the INFO column (field 8) of
# Input_file and write them to stdout as a tab-separated table preceded by a
# single "#"-prefixed header line.
awk '
# Return the value stored under "key" in a ";"-separated KEY=VALUE list, or ""
# when the key is absent. Keys are compared whole, so "AF" can never be
# confused with "EAS_AF", "AMR_AF" and so on.
function info_value(info, key,    n, i, items, eq) {
  n = split(info, items, ";")
  for (i = 1; i <= n; i++) {
    eq = index(items[i], "=")
    if (eq > 0 && substr(items[i], 1, eq - 1) == key) {
      return substr(items[i], eq + 1)
    }
  }
  return ""
}
BEGIN {
  # Tab-separated output, with the header emitted once rather than per record.
  OFS = "\t"
  print "#AF", "EAS_AF", "AMR_AF", "AFR_AF", "EUR_AF", "SAS_AF"
}
{
  # Only field 8 is searched, so text in other columns cannot match a key.
  print info_value($8, "AF"), info_value($8, "EAS_AF"), \
        info_value($8, "AMR_AF"), info_value($8, "AFR_AF"), \
        info_value($8, "EUR_AF"), info_value($8, "SAS_AF")
}' Input_file
说明:下面为上述代码逐行添加了注释说明。
awk '
function info_value(info, key,    n, i, items, eq) {   ##Helper: look up one key in a ";"-separated KEY=VALUE list.
  n = split(info, items, ";")                          ##Split the list into its KEY=VALUE items; n is the item count.
  for (i = 1; i <= n; i++) {                           ##Walk through every item.
    eq = index(items[i], "=")                          ##Position of the first "=" in the item (0 when there is none).
    if (eq > 0 && substr(items[i], 1, eq - 1) == key) {  ##Whole-key comparison, so "AF" never matches "EAS_AF".
      return substr(items[i], eq + 1)                  ##Everything after the first "=" is the value.
    }
  }
  return ""                                            ##Key not present in this record: yield an empty value.
}
BEGIN {
  OFS = "\t"                                           ##Join output fields with a tab, as the question requires.
  print "#AF", "EAS_AF", "AMR_AF", "AFR_AF", "EUR_AF", "SAS_AF"   ##Print the header once, before any input is read.
}
{
  print info_value($8, "AF"), info_value($8, "EAS_AF"), \
        info_value($8, "AMR_AF"), info_value($8, "AFR_AF"), \
        info_value($8, "EUR_AF"), info_value($8, "SAS_AF")   ##One output row per input record, taken from field 8 only.
}' Input_file   ##Input_file is the file to read; the result is already tab-separated, so no column command is needed.
答案 1 :(得分:0)
以“;”分隔列
# Turn every ";" into a tab. Assigning $1 to itself forces awk to rebuild the
# record with OFS. It is written as an action followed by an explicit print
# because, as a bare pattern, "$1=$1" silently drops any line whose first field
# is empty or numerically zero.
awk -F';' -v OFS='\t' '{ $1 = $1; print }' file.temp > tmp && mv tmp file.temp
删除不需要的列(新标题:CHROM POS ID REF ALT QUAL FILTER AC AF EAS_AF AMR_AF AFR_AF EUR_AF SAS_AF)
# Keep columns 1-9 and 13-17 (CHROM..FILTER, AC, AF and the five population
# *_AF columns, per the header listed above). OFS is set to a tab so the result
# stays tab-delimited; the default OFS would join the fields with single spaces.
awk -v OFS='\t' '{
  print $1, $2, $3, $4, $5, $6, $7, $8, $9, $13, $14, $15, $16, $17
}' file.temp > tmp && mv tmp file.temp
删除不需要的字符串
# Strip the leading "KEY=" from columns 8-14 in a single pass instead of
# rewriting the file seven times. Each pattern is anchored to the start of its
# column, and OFS is a tab so rebuilt records stay tab-delimited.
awk -v OFS='\t' '
BEGIN {
  # key[i] is the prefix expected in column i + 7 (AC is column 8).
  n = split("AC AF EAS_AF AMR_AF AFR_AF EUR_AF SAS_AF", key, " ")
}
{
  for (i = 1; i <= n; i++) {
    sub("^" key[i] "=", "", $(i + 7))
  }
  print
}' file.temp > tmp && mv tmp file.temp
答案 2 :(得分:0)
这是真正完成此任务的方法:
$ cat tst.awk
# tst.awk - print selected INFO keys from field 8 of a tab-separated file as a
# tab-separated table, preceded by a "#"-prefixed header line.
BEGIN {
    FS = OFS = "\t"
    # Output columns, in order; edit this list to select other INFO keys.
    numFlds = split("AF EAS_AF AMR_AF AFR_AF EUR_AF SAS_AF", fldNames, / /)
    printf "#"
    for (i = 1; i <= numFlds; i++) {
        printf "%s%s", fldNames[i], (i < numFlds ? OFS : ORS)
    }
}
{
    # Forget the previous record, so a key that is missing here prints as an
    # empty column instead of silently repeating an older value.
    delete name2val

    # Split on ";" first and on the first "=" second. Splitting on both
    # characters at once and pairing the pieces would misalign names and values
    # as soon as an item has no "=" or a value contains one.
    numItems = split($8, items, /;/)
    for (i = 1; i <= numItems; i++) {
        eq = index(items[i], "=")
        if (eq > 0) {
            name2val[substr(items[i], 1, eq - 1)] = substr(items[i], eq + 1)
        }
    }

    for (i = 1; i <= numFlds; i++) {
        printf "%s%s", name2val[fldNames[i]], (i < numFlds ? OFS : ORS)
    }
}
$ awk -f tst.awk file
#AF EAS_AF AMR_AF AFR_AF EUR_AF SAS_AF
0.00379393 0 0.0043 0 0.0099 0.0061
输出中的各列看起来没有对齐,这只是因为它按要求使用了制表符分隔。