Question

我的数据如下

0.5,4.96,0.724973,0.01481065
0.5,5.11,0.726749,0.01140151
0.5,4.99,0.893074,0.00910343
0.5,4.14,0.734336,0.00835252
0.5,1.69,0.755600,0.00422898
0.6,4.43,0.733582,0.01796329
0.6,4.47,0.740393,0.01399680
0.6,4.49,0.885607,0.01095668
0.6,3.69,0.720035,0.00992851
0.6,1.60,0.748339,0.00456993
0.7,4.03,0.756354,0.02086922
0.7,3.99,0.771689,0.01705783
0.7,4.02,0.854532,0.01319982
0.7,3.33,0.725414,0.01170297

我想根据第一列的值计算第二，第三和第四列的平均值。

例如，对于第一列为 0.5 的行，期望的结果是：

0.5,4.18,0.766946,0.00957942

Answer 1

GNU datamash的最短解决方案：

# Sort the input, split fields on commas, group rows by field 1, and print the mean of fields 2, 3 and 4 for each group.
datamash -st, -g1 mean 2 mean 3 mean 4 <file

-s - 排序记录
-t, - 将逗号,设置为字段分隔符
-g1 - 按第1个字段对记录进行分组

输出：

0.5,4.178,0.7669464,0.009579418
0.6,3.736,0.7655912,0.011483042
0.7,3.8425,0.77699725,0.01570746

Answer 2

用 awk 来解决（这里假设你的 Input_file 已经按第一列排好序；如果没有，可以在下面的代码之前先排序：sort -t, -k1 | awk ...）：

# Per-group means of columns 2-4, keyed on column 1 (input must be grouped/sorted by column 1).
# Columns are walked explicitly as 2,3,4 because for-in order over an array is unspecified;
# NR>1 replaces the truthiness test on prev so that a key of 0 is still flushed.
awk -F, 'function flush(  k,out){for(k=2;k<=4;k++)out=out (k>2?FS:"") sprintf("%0.2f",sum[k]/cnt);print prev,out;delete sum;cnt=0} NR>1&&prev!=$1{flush()} {for(k=2;k<=4;k++)sum[k]+=$k;cnt++;prev=$1} END{if(NR)flush()}' Input_file

输出如下。

0.5 4.18,0.77,0.01
0.6 3.74,0.77,0.01
0.7 3.84,0.78,0.02

现在也添加非单行（多行）形式的解决方案。

awk -F, '
# Print the finished group: the key, a space (OFS), then the means of
# columns 2-4 joined by commas in column order. Afterwards reset the
# accumulators so the next group starts from zero.
function flush(    col, means) {
  for (col = 2; col <= 4; col++)
    means = means (col > 2 ? FS : "") sprintf("%0.2f", sum[col] / cnt)
  print prev, means
  delete sum
  cnt = 0
}
# The key changed, so the previous group is complete (the input must be
# grouped/sorted by column 1). NR > 1 is tested instead of prev itself so
# that a key of 0 is not silently skipped.
NR > 1 && prev != $1 { flush() }
{
  # Columns are accumulated by explicit index: for-in order over an array
  # is unspecified and could shuffle the output columns.
  for (col = 2; col <= 4; col++)
    sum[col] += $col
  cnt++
  prev = $1
}
# The last group is never followed by a key change; print nothing at all
# when the input was empty.
END { if (NR) flush() }
' Input_file

编辑： 现在也为命令添加说明。

awk -F, '
##-F, on the command line sets the input field separator to a comma.
function flush(    col, means) {
##Prints one finished group. col and means are local variables (extra parameters).
  for (col = 2; col <= 4; col++)
##Walking columns 2, 3 and 4 in a fixed order; for-in order over an array is unspecified, so it must not decide the column order.
    means = means (col > 2 ? FS : "") sprintf("%0.2f", sum[col] / cnt)
##Appending the mean (sum divided by row count) with 2 decimals; a comma (FS) is put before every value except the first.
  print prev, means
##Printing the group key, then OFS (a space by default), then the comma-joined means.
  delete sum
  cnt = 0
##Resetting the sums and the row count so the next group starts from zero.
}
NR > 1 && prev != $1 { flush() }
##From the 2nd line on, a first column different from prev means the previous group is complete (input must be sorted/grouped by column 1).
##NR > 1 is tested instead of prev itself so that a key of 0 is not skipped.
{
  for (col = 2; col <= 4; col++)
    sum[col] += $col
##Adding the 2nd, 3rd and 4th fields of the current line to the running sums of the current group.
  cnt++
##Counting the rows of the current group.
  prev = $1
##Remembering the key of the current line for the comparison on the next line.
}
END { if (NR) flush() }
##The last group is never followed by a key change, so it is printed here; nothing is printed for empty input.
' file17
##file17 is the Input_file name.

Answer 3

这是一个可以用于此目的的简洁小Awk脚本，

#!/usr/bin/awk -f

# Computes, for every distinct value of column 1, the mean of columns 2, 3 and 4
# of a comma-separated file and prints one line per key.
#
# Both the input and the output field separator are set to a comma.
# Numbers written by print are formatted with OFMT (default "%.6g"); CONVFMT only
# governs number-to-string conversion inside expressions and has no effect on this
# output. To print a fixed number of decimals, set OFMT in BEGIN, e.g. OFMT="%.2f".

BEGIN { FS = OFS = "," }

NF == 4 {
    # Remember each key the first time it is seen so that the groups can be
    # printed in input order (for-in traversal order is unspecified).
    if (!($1 in count))
        keys[++nkeys] = $1 ""

    # Accumulate the per-key sums of columns 2-4 and the number of rows seen.
    sumOfCol2[$1] += $2
    sumOfCol3[$1] += $3
    sumOfCol4[$1] += $4
    count[$1]++
}

END {
    # Print key, mean(col2), mean(col3), mean(col4) for every key, in the order
    # the keys first appeared in the input.
    for (k = 1; k <= nkeys; k++) {
        i = keys[k]
        print i, (sumOfCol2[i]/count[i]), (sumOfCol3[i]/count[i]), (sumOfCol4[i]/count[i])
    }
}

并按如下方式运行脚本：

awk -f script.awk file
0.5,4.178,0.766946,0.00957942
0.6,3.736,0.765591,0.011483
0.7,3.8425,0.776997,0.0157075

根据另一列计算列的平均值

3 个答案: