使用awk命令对表内容进行透视(pivot)

时间:2019-04-24 03:15:57

标签: awk

我想使用awk对表内容进行透视(pivot),有人可以分享一下该怎么做吗?谢谢。

表1

FEATURE,TESTER,LICENSE_USED,PROGRAM,AREA
Low,T6712,23,Element01,FT1
High,T7911,54,Element03,FT2
Medium,E8123,48,Element02,FT3
High,F4309,54,Element02,PB1
Low,F4309,23,Element01,PB1
Low,T7911,23,Element04,FT1
High,E8123,54,Element05,FT2
Medium,F4309,48,Element01,PB1

预期的输出:FEATURE(功能)列的各个取值将变为列,列中的值取自 LICENSE_USED(已使用的许可证);没有对应数据的单元格为 0

TESTER,PROGRAM,AREA,High,Low,Medium
E8123,Element02,FT3,0,0,48
E8123,Element05,FT2,54,0,0
F4309,Element01,PB1,0,23,48
F4309,Element02,PB1,54,0,0
T6712,Element01,FT1,0,23,0
T7911,Element03,FT2,54,0,0
T7911,Element04,FT1,0,23,0

另一个输出,是否有可能像下面那样转置预期的输出?

TESTER,E8123,E8123,F4309,F4309,T6712,T7911,T7911
PROGRAM,Element02,Element05,Element01,Element02,Element01,Element03,Element04
AREA,FT3,FT2,PB1,PB1,FT1,FT2,FT1
High,0,54,0,54,0,54,0
Low,0,0,23,0,23,0,23
Medium,48,0,48,0,0,0,0

2 个答案:

答案 0 :(得分:1)

$ awk '
  BEGIN {
    # a comma splits input fields, joins output fields and glues the
    # parts of a multi-part array subscript together
    SUBSEP = ","
    OFS = SUBSEP
    FS = SUBSEP
  }

  # every data row (line 1 is the header and is skipped)
  NR > 1 {
    # tester,program,area identifies one output row; remember that it exists
    combo = $2 SUBSEP $4 SUBSEP $5
    seen[combo]
    # license_used for this combination at the feature level of this row
    used[combo, $1] = $3
  }

  END {
    print "TESTER", "PROGRAM", "AREA", "High", "Low", "Medium"
    # one line per combination, in whatever order awk walks the array
    for (combo in seen) {
      # int() turns a level that never occurred into 0
      high   = int(used[combo, "High"])
      low    = int(used[combo, "Low"])
      medium = int(used[combo, "Medium"])
      print combo, high, low, medium
    }
  }
' file
TESTER,PROGRAM,AREA,High,Low,Medium
F4309,Element02,PB1,54,0,0
E8123,Element02,FT3,0,0,48
T6712,Element01,FT1,0,23,0
T7911,Element03,FT2,54,0,0
F4309,Element01,PB1,0,23,48
E8123,Element05,FT2,54,0,0
T7911,Element04,FT1,0,23,0

如果您想要一个花哨的单行命令(one-liner),下面是去掉了空格和注释的版本:

# One-line form of the pivot: prints the header, then one TESTER,PROGRAM,AREA,High,Low,Medium line per combination found in `file` (a level that never occurred prints as 0).
awk 'BEGIN {FS=OFS=SUBSEP=","} NR>1{a[$2,$4,$5];b[$2,$4,$5,$1]=$3} END{print "TESTER","PROGRAM","AREA","High","Low","Medium";for(k in a) print k,0+b[k,"High"],0+b[k,"Low"],0+b[k,"Medium"]}' file

对于转置输出:

$ awk '
BEGIN {
  FS = OFS = ","
  # output rows 4-6 hold the feature levels: `rowof` maps a level name to
  # its row number and column 1 of that row carries the name as its label
  split("High Low Medium", level, " ")
  for (i = 1; i <= 3; i++) {
    rowof[level[i]] = i + 3
    grid[i + 3, 1] = level[i]
  }
}
# every input line, header included, becomes one output column:
# rows 1-3 are tester, program and area
{
  grid[1, NR] = $2
  grid[2, NR] = $4
  grid[3, NR] = $5
}
# data lines also store license_used in the row of their feature level
NR > 1 {
  grid[rowof[$1], NR] = $3
}
END {
  # rebuild the current record once per output row and print it
  for (r = 1; r <= 6; r++) {
    for (c = 1; c <= NR; c++) {
      value = grid[r, c]
      # level cells that were never filled print as 0; labels stay text
      $c = (r > 3 && c > 1) ? int(value) : value
    }
    print
  }
}' file
TESTER,T6712,T7911,E8123,F4309,F4309,T7911,E8123,F4309
PROGRAM,Element01,Element03,Element02,Element02,Element01,Element04,Element05,Element01
AREA,FT1,FT2,FT3,PB1,PB1,FT1,FT2,PB1
High,0,54,0,54,0,0,54,0
Low,23,0,0,0,23,23,0,0
Medium,0,0,48,0,0,0,0,48

按AREA(即第5列)进行排序:

$ awk '
BEGIN {
  FS = OFS = ","
  # output rows 4-6 hold the feature levels: `rowof` maps a level name to
  # its row number and column 1 of that row carries the name as its label
  split("High Low Medium", level, " ")
  for (i = 1; i <= 3; i++) {
    rowof[level[i]] = i + 3
    grid[i + 3, 1] = level[i]
  }
}
# every input line, header included, becomes one output column:
# rows 1-3 are tester, program and area
{
  grid[1, NR] = $2
  grid[2, NR] = $4
  grid[3, NR] = $5
}
# data lines also store license_used in the row of their feature level
NR > 1 {
  grid[rowof[$1], NR] = $3
}
END {
  # NR counts lines across both inputs, so it is the number of columns
  for (r = 1; r <= 6; r++) {
    for (c = 1; c <= NR; c++) {
      value = grid[r, c]
      # level cells that were never filled print as 0; labels stay text
      $c = (r > 3 && c > 1) ? int(value) : value
    }
    print
  }
}' <(head -n 1 file) <(tail -n +2 file | sort -t ',' -k 5) # <- 5th
TESTER,T6712,T7911,E8123,T7911,E8123,F4309,F4309,F4309
PROGRAM,Element01,Element04,Element05,Element03,Element02,Element02,Element01,Element01
AREA,FT1,FT1,FT2,FT2,FT3,PB1,PB1,PB1
High,0,0,54,54,0,54,0,0
Low,23,23,0,0,0,0,23,0
Medium,0,0,0,0,48,0,0,48

注意:如果您使用的 shell 不是 bash,并且上面最后一条命令无法正常运行,请放弃进程替换(process substitution),改用命令组,例如:

# The command group writes the header line of `file` followed by its data rows sorted on field 5 into a single pipe read by awk; '...' stands for the awk program.
{ head -n 1 file; tail -n +2 file | sort -t ',' -k 5; } | awk '...'

答案 1 :(得分:1)

这是原始输入内容:

FEATURE,TESTER,LICENSE_USED,PROGRAM,AREA
Low,T6712,23,Element01,FT1
High,T7911,54,Element03,FT2
Medium,E8123,48,Element02,FT3
High,F4309,54,Element02,PB1
Low,F4309,23,Element01,PB1
Low,T7911,23,Element04,FT1
High,E8123,54,Element05,FT2
Medium,F4309,48,Element01,PB1

好了,我们可以看看发生了什么事

$ tr , '\011'  < data.txt | column -tR 3,5
FEATURE  TESTER  LICENSE_USED  PROGRAM    AREA
Low      T6712             23  Element01   FT1
High     T7911             54  Element03   FT2
Medium   E8123             48  Element02   FT3
High     F4309             54  Element02   PB1
Low      F4309             23  Element01   PB1
Low      T7911             23  Element04   FT1
High     E8123             54  Element05   FT2
Medium   F4309             48  Element01   PB1

这是预期的输出:

$ tr , '\011'  < expected.txt | column -tR 2,3,4,5,6,7,8
TESTER       E8123      E8123      F4309      F4309      T6712      T7911      T7911
PROGRAM  Element02  Element05  Element01  Element02  Element01  Element03  Element04
AREA           FT3        FT2        PB1        PB1        FT1        FT2        FT1
High             0         54          0         54          0         54          0
Low              0          0         23          0         23          0         23
Medium          48          0         48          0          0          0          0

看起来我们希望把三个列标题(TESTER、PROGRAM、AREA)转换为行标题,并且为每个测试人员/程序/区域的组合使用 Low、Medium 和 High 作为行标题(显然还要按测试人员排序):

$ tr , '\011'  < data.txt | column -tR 3 | sort -k2
Medium   E8123             48  Element02  FT3
High     E8123             54  Element05  FT2
Low      F4309             23  Element01  PB1
Medium   F4309             48  Element01  PB1
High     F4309             54  Element02  PB1
Low      T6712             23  Element01  FT1
Low      T7911             23  Element04  FT1
High     T7911             54  Element03  FT2
FEATURE  TESTER  LICENSE_USED  PROGRAM    AREA

我们可以很容易地看到Testers也可以在不同的Elements上工作,因此我们必须考虑到这一点:

# Pivot and transpose a FEATURE,TESTER,LICENSE_USED,PROGRAM,AREA CSV:
# one output column per distinct tester/program/area combination and one
# output row per header (TESTER, PROGRAM, AREA, High, Low, Medium).
# Columns appear in the unspecified order of `for (key in array)`.
BEGIN {
    FS=","
}
# Data rows (the header line is skipped): append "FEATURE:LICENSE_USED,"
# to the entry keyed by tester, program and area.
NR > 1 {
    data[$2,$4,$5] = data[$2,$4,$5] $1 ":" $3 FS
}
END {
    #construct the table
    for (tester_element_area in data) {
        split(tester_element_area, parts, SUBSEP)

        tester  = parts[1]
        element = parts[2]
        area    = parts[3]

        # the trailing separator yields one empty final piece; it only
        # sets template[""], which is never printed
        n = split(data[tester_element_area], d)

        # levels that do not occur for this combination are reported as 0
        template["High"]   = 0
        template["Medium"] = 0
        template["Low"]    = 0

        for (i = 1; i <= n; i++) {
            split(d[i], license, ":")

            degree = license[1]
            value  = license[2]

            template[ degree ] = value
        }

        # each row string grows by one ",value" per column
        table["TESTER"]  = table["TESTER"]  FS tester
        table["PROGRAM"] = table["PROGRAM"] FS element
        table["AREA"]    = table["AREA"]    FS area
        table["High"]    = table["High"]    FS template["High"]
        table["Medium"]  = table["Medium"]  FS template["Medium"]
        table["Low"]     = table["Low"]     FS template["Low"]
    }

    #print the table
    header[1] = "TESTER"
    header[2] = "PROGRAM"
    header[3] = "AREA"
    header[4] = "High"
    header[5] = "Low"
    header[6] = "Medium"

    for (i = 1; i <= 6; i++) {
        header_name = header[i]

        # always print through "%s": text taken from the input must never
        # be used as the printf format, or a "%" in it would be interpreted
        printf "%s", header_name

        # the row string starts with a separator, so parts[1] is empty and
        # the first value follows the header after one separator
        n = split(table[header_name], parts)

        for (j = 1; j <= n; j++) {
            if (j > 1) {
                printf "%s", FS
            }
            printf "%s", parts[j]
        }
        print ""
    }
}

让我们看看它返回什么:

$ awk -f prog.awk < data.txt | tr , '\011' | column -tR2,3,4,5,6,7,8
TESTER       E8123      T7911      F4309      E8123      T6712      T7911      F4309
PROGRAM  Element05  Element04  Element02  Element02  Element01  Element03  Element01
AREA           FT2        FT1        PB1        FT3        FT1        FT2        PB1
High            54          0         54          0          0         54          0
Low              0         23          0          0         23          0         23
Medium           0          0          0         48          0          0         48

还不错,只是列的排列顺序不正确,它们应该经过排序。如果您愿意使用GAWK,只需稍微改一下代码即可:

END {
        for (tester_element_area in data) {                                     
                cols[++i] = tester_element_area                                 
        }                                                                       

        m = asort(cols)                                                         

        #construct the table                                                    
        for (k = 1; k <= m; k++) {                                              
                tester_element_area = cols[k]
                ...

输出:

$ awk -f prog.awk < data.txt | tr , '\011' | column -tR2,3,4,5,6,7,8
TESTER       E8123      E8123      F4309      F4309      T6712      T7911      T7911
PROGRAM  Element02  Element05  Element01  Element02  Element01  Element03  Element04
AREA           FT3        FT2        PB1        PB1        FT1        FT2        FT1
High             0         54          0         54          0         54          0
Low              0          0         23          0         23          0         23
Medium          48          0         48          0          0          0          0

更新:按区域排序

NR > 1 {
    data[$5,$2,$4] = data[$5,$2,$4] $1 ":" $3 FS
}
END {
        for (tester_element_area in data) {                                     
                cols[++i] = tester_element_area
        }                                                                       

        m = asort(cols)                                                         

        #construct the table                                                    
        for (k = 1; k <= m; k++) {                                              
                tester_element_area = cols[k]
                split(tester_element_area, parts, SUBSEP)

                area     = parts[1]
                tester   = parts[2]
                element  = parts[3]

输出:

TESTER       T6712      T7911      E8123      T7911      E8123      F4309      F4309
PROGRAM  Element01  Element04  Element05  Element03  Element02  Element01  Element02
AREA           FT1        FT1        FT2        FT2        FT3        PB1        PB1
High             0          0         54         54          0          0         54
Low             23         23          0          0          0         23          0
Medium           0          0          0          0         48         48          0