按 4 个字段分组并求和

时间:2015-01-05 13:31:09

标签: awk group-by gawk

以下是我的输入数据:

aNumber bNumber startDate   timeZone    duration    currencyType    cost    dicatedAccused  balanceAfter    trafficCase teleServiceCode location    dataVolume  numberOfEvents  fafIndicator    netWorkID   serviceProvideID    serviceClass    nAno    nBno    bNumberZnCode   fileNamedID Destination Operator    unknown3    MainAmount  ReAnalyse   DEDICATEDACCBALBEF  DEDICATEDACCBALAFT  ACCOUNTGROUPID  SERVICEOFFERINGS    SELECTEDCOMMUNITYID BALANCEBEFORE
22677512549 778 2014-07-02 10:16:35.000 NULL    NULL    localCurrency   0,00    2   11.50   0   3   22676020076 NULL    NULL    NULL    NULL    NULL    34  77512549    778 NULL    1131257 OTHER   Short Code  126244088   0.0000  0   NULL    NULL    NULL    NULL    NULL    11.5000
22675557361 76457227    2014-07-02 10:16:38.000 NULL    NULL    localCurrency   10,00   2   1009.10 0   3   22676613028 NULL    NULL    1   NULL    NULL    35  75557361    76457227    NULL    1131257 Airtel  Airtel  4132206314  10.0000 0   NULL    NULL    NULL    NULL    NULL    1019.1000
22677521277 778 2014-07-04 10:16:42.000 NULL    NULL    localCurrency   0,00    NULL    0.00    0   4   22676020078 NULL    NULL    NULL    NULL    NULL    34  77521277    778 NULL    1131257 OTHER   Short Code  130071591   0.0000  0   NULL    NULL    NULL    NULL    NULL    0.0000
22676099496 77250331    2014-07-03 10:16:42.000 NULL    NULL    localCurrency   1,00    9   0.50    0   4   22676613028 NULL    NULL    NULL    NULL    NULL    35  76099496    77250331    NULL    1131257 Airtel  Airtel  4132218551  0.0000  0   4.0000  3.0000  NULL    NULL    NULL    0.5000
22667222160 22667262389 2014-07-02 10:16:43.000 NULL    NULL    localCurrency   10,00   1   16070.00    0   4   22676613028 NULL    NULL    NULL    NULL    NULL    35  67222160    67262389    NULL    1131257 Airtel  Airtel  4132222628  10.0000 0   NULL    NULL    NULL    NULL    NULL    16080.0000

我需要按日期(startDate)、dicatedAccused、trafficCase 和 teleServiceCode 这 4 个字段分组,然后对每个分组的 duration、cost、balanceAfter、MainAmount 和 BALANCEBEFORE 求和。
如果只按一个字段分组求和,就不会有问题,但我们必须按 4 个字段分组。

这是我正在使用的awk脚本

# Asker's attempt; it does not produce the grouped totals.
# - BEGIN prints an output header; NR == 1 skips the input header line.
# - showday() prints the current date followed by the running totals.
# - DedicatedAccUsed is never assigned, so `DedicatedAccUsed != $3` compares an
#   empty string with the date field and is true on every data line: each line
#   prints the previous totals and resets them, so nothing accumulates across
#   lines, and $9, $11 and $12 (the other grouping fields) are never consulted.
# - sub() turns the first comma in $8 into a dot before the value is added.
# - column -t aligns the result into a table.
awk 'BEGIN {print "date Duration Cost BalanceAfter MainAmount DedicatedAccBalBefore DedicatedAccBalAfter BalanceBefore"} NR == 1 {next} function showday() {
    printf "%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", date, duration, cost, bAfter, main, dedAccbBefore, dedAccbAfter, bBefore} DedicatedAccUsed != $3 {
   if (date) showday()
   date = $3
   duration = cost = bAfter = main = bBefore = dedAccbBefore = dedAccbAfter = 0}{
   sub(/,/, ".", $8)
   duration += $6
   cost += $8
   bAfter += $10
   main += $(NF-7)
   dedAccbBefore += $(NF-5)
   dedAccbAfter += $(NF-4)
   bBefore += $NF}END {showday()}' test.txt | column -t

这是我期望的输出:

startDate dicatedAccused trafficCase teleServiceCode duration cost balanceAfter MainAmount BALANCEBEFORE
02/07/2014 2 0 3 0 10 1020.60 10 1020.60
02/07/2014 1 0 4 0 10 16070.00 10 16080
03/07/2014 9 0 4 0 1 0 0.0000 0,5
04/07/2014 NULL 0 4 0 0 0 0.0000 0

1 个答案:

答案 0 :(得分:1)

输出结果如下:

date         dAccused  TrafficCase  ServiceCode   Duration       Cost BalanceAfter MainAmount BalanceBefore
2014-07-02          1            0            4          0         10      16070         10      16080
2014-07-03          9            0            4          0          1        0.5          0        0.5
2014-07-04       NULL            0            4          0          0          0          0          0
2014-07-02          2            0            3          0         10     1020.6         10     1030.6

我修改了你的脚本,改用数组来存储各分组的总和,并在其中添加了注释:

awk '
# Summarise the call records in test.txt: group by date, dicatedAccused,
# trafficCase and teleServiceCode, then total duration, cost, balanceAfter,
# MainAmount and BALANCEBEFORE for every group.
# Records are split on whitespace, so the timestamp occupies two fields
# ($3 date, $4 time) and the columns after it sit one position past their
# index in the header line.
NR == 1 { next }                       # skip the column-header line
{
    sub(/,/, ".", $8)                  # cost is written with a decimal comma
    # The group key doubles as the pre-formatted left part of the output row.
    key = sprintf("%-10s %10s %12s %12s", $3, $9, $11, $12)
    duration[key] += $6
    cost[key]     += $8
    bAfter[key]   += $10
    # Trailing columns are addressed from the end of the record: the Operator
    # value can contain a space ("Short Code"), which shifts positions counted
    # from the left but not those counted from the right.
    main[key]     += $(NF-7)
    bBefore[key]  += $NF
}
END {
    # Header row.
    printf "%-10s %10s %12s %12s %10s %10s %10s %10s %10s\n", "date","dAccused","TrafficCase","ServiceCode","Duration","Cost","BalanceAfter","MainAmount","BalanceBefore"
    # Every array shares the same keys, so walking one of them reaches all
    # groups. The visiting order of for-in is unspecified; pipe the output
    # through sort if a stable order is required.
    for (key in duration) {
        printf "%-47s %10s %10s %10s %10s %10s\n", key, duration[key], cost[key], bAfter[key], main[key], bBefore[key]
    }
}' test.txt

希望解释得足够清楚;如果需要更多细节,请告诉我。