我有每个基因(第2列)的SYNONYMOUS_CODING和NON_SYNONYMOUS_CODING突变(第3列)的频率(第1列)。
我需要计算每个基因的dN/dS
比(NON_SYNONYMOUS_CODING / SYNONYMOUS_CODING
)。
并非所有基因都同时具有SYNONYMOUS_CODING和NON_SYNONYMOUS_CODING的频率。
0.00491398 A1BG SYNONYMOUS_CODING
0.923601 A1BG NON_SYNONYMOUS_CODING
0.051361 A1CF NON_SYNONYMOUS_CODING
0.153161 A1CF SYNONYMOUS_CODING
0.0977385 A2M SYNONYMOUS_CODING
1.36114 A2M NON_SYNONYMOUS_CODING
2.19662 A2ML1 SYNONYMOUS_CODING
3.43866 A2ML1 NON_SYNONYMOUS_CODING
预期结果如下:
187.95 A1BG
0.3353 A1CF
13.926 A2M
1.565 A2ML1
答案 0 :(得分:1)
这是一个小的awk脚本:
cat script.awk
# script.awk -- print the dN/dS ratio of every gene that has both frequencies.
#
# Input : whitespace-separated records "<frequency> <gene> <effect>", where
#         <effect> is SYNONYMOUS_CODING or NON_SYNONYMOUS_CODING.
# Output: "<NON_SYNONYMOUS_CODING / SYNONYMOUS_CODING> <gene>", printed as soon
#         as the second record of a gene has been read.
#
# Records are paired by gene name ($2), not by line parity, so a gene that has
# only one of the two effects cannot shift the pairing of the genes after it.
# Such genes, and genes whose synonymous frequency is 0, produce no output
# because the ratio cannot be computed.

NF < 3 { next }                          # blank or malformed line: ignore

$3 == "NON_SYNONYMOUS_CODING" { nonSyn[$2] = $1 }
$3 == "SYNONYMOUS_CODING"     { syn[$2] = $1 }

($2 in syn) && ($2 in nonSyn) {          # both frequencies of this gene known
    if (syn[$2] + 0 != 0)                # avoid a fatal division by zero
        print (nonSyn[$2] / syn[$2]), $2
    delete syn[$2]                       # gene is done: free its entries
    delete nonSyn[$2]
}
运行:
awk -f script.awk input.txt
输出:
187.954 A1BG
0.33534 A1CF
13.9263 A2M
1.56543 A2ML1
答案 1 :(得分:1)
假设(输入需按基因名排序,使同一基因的记录相邻,例如):
sort -k2 genes | awk -f dNdSCompute.awk
并不是所有的基因都可能同时具有
SYNONYMOUS_CODING
和NON_SYNONYMOUS_CODING
的频率=>在这种情况下,由于无法计算dN/dS
的比率,它们将被忽略。
代码:
$ cat dNdSCompute.awk
# dNdSCompute.awk -- print "<dN/dS> <gene>" for each gene that has both a
# SYNONYMOUS_CODING and a NON_SYNONYMOUS_CODING record.
#
# Input : whitespace-separated records "<frequency> <gene> <effect>".
#         Records of the same gene must be adjacent (e.g. pipe the input
#         through `sort -k2` first); only one gene is kept in memory at a time.
# Output: "<nonSyn / syn> <gene>"; genes missing one of the two frequencies,
#         or whose synonymous frequency is 0, are skipped.

NF < 3 { next }                      # blank or malformed line: ignore

{
    # a new gene starts: discard whatever an unpaired previous gene left behind
    if ($2 != gene) {
        syn = nonSyn = ""
        gene = $2
    }

    # store the first column in syn or nonSyn depending on the third column
    if ($3 == "NON_SYNONYMOUS_CODING")
        nonSyn = $1
    else
        syn = $1

    # both frequencies of the current gene are known: print and reset
    if (syn != "" && nonSyn != "") {
        if (syn + 0 != 0)            # avoid a fatal division by zero
            print (nonSyn / syn), gene
        syn = nonSyn = ""
    }
}
输入:
(包含并非同时具有两种频率的基因)
$ cat genes
0.00491398 A1BG SYNONYMOUS_CODING
0.923601 A1BG NON_SYNONYMOUS_CODING
0.051361 A1CF NON_SYNONYMOUS_CODING
0.153161 A1CF SYNONYMOUS_CODING
0.111161 A2CF SYNONYMOUS_CODING
0.0977385 A2M SYNONYMOUS_CODING
1.36114 A2M NON_SYNONYMOUS_CODING
1.76174 A3R NON_SYNONYMOUS_CODING
2.19662 A2ML1 SYNONYMOUS_CODING
3.43866 A2ML1 NON_SYNONYMOUS_CODING
输出:
$ awk -f dNdSCompute.awk genes
187.954 A1BG
0.33534 A1CF
13.9263 A2M
1.56543 A2ML1
答案 2 :(得分:0)
使用GNU awk和indirect function calls(将$3
的值用作要调用的函数的名称):
$ awk '
# Ratio helpers, one per effect name found in $3. Each is called indirectly
# with the frequency of the CURRENT record first and the frequency stored for
# the same gene second, hence the swapped parameter names.
function NON_SYNONYMOUS_CODING(nonsyn, syn) {   # current record holds dN
    return nonsyn / syn
}
function SYNONYMOUS_CODING(syn, nonsyn) {       # current record holds dS
    return nonsyn / syn
}
!($2 in pending) {              # first record of this gene: remember it
    pending[$2] = $1
    next
}
{                               # second record of this gene
    handler = $3                # name of the function to call comes from $3
    print $2, @handler($1, pending[$2])   # gawk indirect function call
    delete pending[$2]          # gene is done: free its entry
}' file
输出:
A1BG 187.954
A1CF 0.33534
A2M 13.9263
A2ML1 1.56543