对唯一字符串进行排序,创建列和平均值

时间:2015-04-08 21:38:51

标签: sorting awk

输入文件:

civil   4
posición    3
formación   7
posición    5
domingo 1
retrato 5
retrato 6
civil   6
formación   3
retrato 7
domingo 7
media   1
media   1

我希望输出为:

civil   4   domingo 1   formación   3   media   1   posición    3   retrato 5
 civil  6   domingo 7   formación   7   media   1   posición    5   retrato 6
average#    average#    average#        average#    average#        retrato 7
                                                                    average#

所以我可以先用 sort -t"," 把原始输入排序成:

civil   4
civil   6
domingo 1
domingo 7
formación   3
formación   7
media   1
media   1
posición    3
posición    5
retrato 5
retrato 6
retrato 7

并且用类似 awk '{x+=$insertcolumn} END { for (x> 0) print x/NR }' 的命令可以求出平均值,但是中间这一步——把数据排成列的格式——该如何实现?

2 个答案:

答案 0 :(得分:2)

$ cat tst.awk               
# tst.awk - lay out "name value" records as side-by-side columns, one column
# per distinct name, and print each column's average directly under its last
# value.
#
# Input : whitespace-separated "<name> <value>" lines; feed them pre-sorted
#         (sort file | awk -f tst.awk) to get the columns in sorted order.
# Output: one line per row; every cell is "<name> <value>" padded to a common
#         width, cells joined by OFS, rows ended by ORS.
BEGIN { nw = length("average"); vw = 1 }   # minimum name / value widths

# A blank line carries no record; without this guard it would become a
# column whose name is the empty string.
!NF { next }

# First sighting of a name: remember it so columns keep input order.
!seenCnt[$1]++ { keys[++numKeys] = $1 }

{
    vals[$1, seenCnt[$1]] = $2             # n-th value seen for this name
    sum[$1] += $2
    nw = (length($1) > nw ? length($1) : nw)
    vw = (length($2) > vw ? length($2) : vw)
    numRows = (seenCnt[$1] > numRows ? seenCnt[$1] : numRows)
}

END {
    # Work out every average before printing anything: an average such as
    # 4.5 can be wider than all input values, and the value column has to be
    # wide enough for it or the rows stop lining up.
    for (keyNr = 1; keyNr <= numKeys; keyNr++) {
        key = keys[keyNr]
        avg[key] = (sum[key] / seenCnt[key]) ""   # same text %s would print
        vw = (length(avg[key]) > vw ? length(avg[key]) : vw)
    }

    # One extra row so the tallest column still gets its average line.
    for (rowNr = 1; rowNr <= (numRows + 1); rowNr++) {
        for (keyNr = 1; keyNr <= numKeys; keyNr++) {
            key = keys[keyNr]
            name = val = ""                # default: empty padding cell
            if ( (key, rowNr) in vals ) {
                name = key
                val  = vals[key, rowNr]
            }
            else if ( rowNr == seenCnt[key] + 1 ) {
                # the row right after this name's last value
                name = "average"
                val  = avg[key]
            }
            printf "%-*s %*s%s", nw, name, vw, val, (keyNr < numKeys ? OFS : ORS)
        }
    }
}

$ sort file | awk -f tst.awk
civil     4 domingo   1 formación 3 media     1 posición  3 retrato   5
civil     6 domingo   7 formación 7 media     1 posición  5 retrato   6
average   5 average   4 average   5 average   1 average   4 retrato   7
                                                            average   6

答案 1 :(得分:1)

假设您的输入是逗号分隔的值:

代码

# gawk only (arrays of arrays, length() of an array). Cells are detected by membership ("i in a[j]"), so a value of 0 counts as data.
gawk <inputFile -F, 'BEGIN{max=0; maxl=0}$2 != ""{x=$1; a[x][0]+=$2; l=length(a[x]); a[x][l]=$2; if (l > max) max=l; l2=length($1); if (l2>maxl) maxl=l2}END{i=0; n=maxl+2; while (i<max){i++; for (j in a) {if (!(i in a[j])) {printf("%"n"s %2s","",""); if (!(j in b)) b[j]=a[j][0]/(i-1)} else {printf("%"n"s %2s",j,a[j][i]); if (i==max) b[j]=a[j][0]/i}}; print ""; }; print ""; for (j in a) {printf("%"maxl"s %.2f","avg",b[j])}; print ""}'

解释版本

# Lay out comma-separated "name,value" records as one column per name and
# print each name's average on a final line.
# Requires gawk (arrays of arrays, length() of an array); run it with -F, so
# that $1 is the name and $2 the value.
BEGIN {
  max=0                 # most values seen for any name = number of rows to print
  maxl=0                # longest name, used to size the columns
}
$2 != "" {              # only records that actually carry a value
  x=$1
  a[x][0]+=$2           # slot 0 holds the running sum for this name
                        # (the assignment also makes a[x] a subarray)
  l=length(a[x])        # slots in use, sum included = next free index
  a[x][l]=$2            # append the new value at index 1, 2, ...
  if (l > max) max=l
  l2=length($1)
  if (l2>maxl) maxl=l2  # track the longest name
}
END {
  i=0
  n=maxl+2              # pretty print with additional spaces
  while (i<max){
    i++                 # rows start at 1; index 0 is the sum
    for (j in a) {
      # Test membership, not truthiness: a stored 0 is a real value, and
      # "i in a[j]" does not create the element as a side effect.
      if (!(i in a[j])) {
        printf("%"n"s %2s","","")              # empty column
        if (!(j in b)) b[j]=a[j][0]/(i-1)      # first gap: i-1 values were stored
      } else {
        printf("%"n"s %2s",j,a[j][i])          # show column
        if (i==max) b[j]=a[j][0]/i             # tallest columns never hit a gap
      }
    }
    print ""                           # start next line
  }
  print ""                             # skip a line
  for (j in a) {
    printf("%"maxl"s %.2f","avg",b[j]) # print averages
  }
  print ""                             # end output with a newline
}

输入

civil,4
posición,3
formación,7
posición,5
domingo,1
retrato,5
retrato,6
civil,6
formación,3
retrato,7
domingo,7
media,1
media,1

输出

    domingo  1   posición  3      media  1    retrato  5      civil  4  formación  7
    domingo  7   posición  5      media  1    retrato  6      civil  6  formación  3
                                              retrato  7

     avg  4.00     avg  4.00     avg  1.00     avg  6.00     avg  5.00     avg  5.00

编辑:非gawk版本

awk不能在数组上使用length(),因此我们将长度存储在另一个数组中。

l=length(a[x])       # gawk extension: length() of an array gives its element count
a[x][l]=$2           # append the value at the next free index
if (l > max) max=l   # remember the tallest column

需要改为

l[x]++                     # count values per key in a separate array instead of calling length(a[x])
a[x][l[x]]=$2              # store the value at index 1, 2, ... for this key
if (l[x] > max) max=l[x]   # remember the tallest column
# NOTE(review): the a[x][...] subarray syntax is itself a gawk extension - confirm the target awk accepts it.

awk one-liner

# Portable awk: composite a[name,i] keys replace the gawk-only a[name][i] subarrays; l[name] holds each name's value count, so a value of 0 still counts as data.
awk <inputFile -F, 'BEGIN{max=0; maxl=0}$2 != ""{x=$1; a[x,0]+=$2; l[x]++; a[x,l[x]]=$2; if (l[x] > max) max=l[x]; l2=length($1); if (l2>maxl) maxl=l2}END{i=0; n=maxl+2; while (i<max){i++; for (j in l) {if (i > l[j]) {printf("%"n"s %2s","",""); if (!(j in b)) b[j]=a[j,0]/(i-1)} else {printf("%"n"s %2s",j,a[j,i]); if (i==max) b[j]=a[j,0]/i}}; print ""; }; print ""; for (j in l) {printf("%"maxl"s %.2f","avg",b[j])}; print ""}'

(如果您的awk实际上是gawk,请使用gawk --posix)

附加练习

留给读者练习:

替换最后一个for (...){print ...}循环以允许按字母顺序对输出列进行排序。