基于第一列组合多个制表符分隔文件的某些列

时间:2014-07-30 20:40:46

标签: awk merge tsv

inFile中的第一列包含一个不一定存在于所有inFiles中的字符串

每个inFile的第2列和第7列包含 Title# 字符串

使用AWK,我无法把这些部分正确地拼到一起。希望我使用的描述性变量名有助于说明我想做的事情。以下是我认为需要的组成部分:

  1. 制表符分隔的输入文件:-F'\t'
  2. 遍历第1列中的字符串,但每个'name'只向'1stColumnNames'添加一次:!1stColumnNames[$1]++ { name[++i] = $1 }
  3. 为每个.tsv文件创建一个新索引,以存储每个文件的值,以避免覆盖每个列的值:!r[FILENAME]++ { ++argind }
  4. 为每个文件在第2列和第7列中存储相应的列值:{ 2ndColumnVals[$1, argind] = $2 } { 7thColumnVals[$1, argind] = $7 }
  5. 打印所有1stColumnNames以及相关的2ndColumnVals和7thColumnVals,包括其标题'Title1''Title2''Title3'等等:?????
  6. 对于特定的2ndColumnVals或7thColumnVals为空的索引值,打印为Mtee:?????
  7. 对当前工作目录中的所有.tsv文件执行此操作并输出新的tsv文件:*.tsv > outFile.tsv
  8. 示例文件

    inFile1.tsv

    Names   Title1  Title2
    AAAA    1111    123456
    BBBBB   1111    123456
    CCC     1111    123456
    

    inFile2.tsv

    Names   Title3  Title4
    BBBBB   2222    789456
    DDDDD   2222    789456
    EEEE    2222    789456
    

    inFile3.tsv

    Names   Title5  Title6
    AAAA    3333    987654
    CCC     3333    987654
    EEEE    3333    987654
    

    outFile123.tsv

    Names   Title1  Title2  Title3  Title4  Title5  Title6
    AAAA    1111    123456  Mtee    Mtee    3333    987654  
    BBBBB   1111    123456  2222    789456  Mtee    Mtee
    CCC     1111    123456  Mtee    Mtee    3333    987654
    DDDDD   Mtee    Mtee    2222    789456  Mtee    Mtee
    EEEE    Mtee    Mtee    2222    789456  3333    987654
    




    测试脚本

    GNU Awk 4.0.1 位于 /usr/bin/awk,所以我创建了下面这个文件,并在3个输入文件所在的同一工作目录中执行它:

    命名为script1.sh
    #### script1.sh: join columns 2 and 3 of every input TSV on the column-1 key.
    #### Example Usage:  script1.sh inFile1.tsv inFile2.tsv inFile3.tsv > outFile123.tsv

    awk -F'\t' '
    # FNR restarts at 1 on the first record of each input file, so this
    # counts the input files that contain at least one record.
    FNR == 1 { fileCount++ }

    {
        # Remember each column-1 key once, in order of first appearance.
        if (!($1 in known)) {
            known[$1] = 1
            order[++keyCount] = $1
        }
        # Columns 2 and 3 of this record, filed under (key, file number).
        cell[$1, fileCount] = $2 FS $3
    }

    END {
        # One output row per key; one column pair per input file.
        for (k = 1; k <= keyCount; k++) {
            row = order[k]
            for (f = 1; f <= fileCount; f++) {
                if ((order[k], f) in cell)
                    row = row "\t" cell[order[k], f]
                else
                    row = row "\tMtee\tMtee"    # key absent from file f
            }
            print row
        }
    }
    ' "$@"
    

    运行 awk -F script1.awk inFile1.tsv inFile2.tsv inFile3.tsv > outFile123.tsv 会显示以下错误消息:

    awk: cmd. line:1: inFile1.tsv

    awk: cmd. line:1: ^ syntax error




    来自 konsolebox 的测试脚本2运作完美,但我试图通过添加注释来理解每一行:

    #!/usr/bin/awk -f
    #### named as script2.awk
    #### Example Usage:  awk -f script2.awk inFile1.tsv inFile2.tsv inFile3.tsv > outFile123.tsv
    
    BEGIN { FS = "\t" }             # FS is the input Field Separator: split every record on tabs
    { sub(/\r/, "") }               # delete the first carriage return in the record (the CR of a CRLF line ending)
    !f[FILENAME]++ { ++indx }       # the first record seen for each new file name bumps indx, so indx is the current file number (1, 2, 3, ...)
    !a[$1]++ { keys[i++] = $1 }     # the first time a column-1 string is seen, append it to keys (indexed 0, 1, 2, ...)
    { b[$1, indx] = $2 FS $3 }      # store "col2<TAB>col3" of this record under (column-1 string, file number)
    END {                           # runs once, after the last input record has been read
        for (i = 0; i in keys; ++i) {   # walk keys[0], keys[1], ... and stop when index i is no longer present
            key = keys[i]               # the column-1 string for this output row
            printf "%s", key            # print that one string; keys is an array and cannot be used as a scalar value
            for (j = 1; j <= indx; ++j) {   # one pass per input file, in the order the files were read
                v = b[key, j]               # what file j stored for this key; empty if the key never appeared in file j
                printf "\t%s", length(v) ? v : "Mtee" FS "Mtee"    # non-empty: print the stored pair; empty: print two tab-separated Mtee placeholders
            }
            print ""    # finish the output row with a newline
        }
    }
    

2 个答案:

答案 0 :(得分:3)

你需要这样的东西:

gawk 版本(使用 ARGIND 以及 gawk 4.0+ 中的真正二维数组):

$ gawk -F'\t' '
# A key is new while it has no sub-array yet; remember it in first-seen order.
!($1 in cells) { order[++keyCount] = $1 }

# Columns 2 and 3 of this record, stored per key and per input file (ARGIND).
{ cells[$1][ARGIND] = $2 FS $3 }

END {
    lastFile = ARGIND    # still holds the index of the last file processed
    for (k = 1; k <= keyCount; k++) {
        name = order[k]
        line = name
        for (f = 1; f <= lastFile; f++) {
            if (f in cells[name])
                line = line "\t" cells[name][f]
            else
                line = line "\tMtee\tMtee"    # this key was not in file f
        }
        print line
    }
}
' file1 file2 file3

非gawk版本:

awk -F'\t' '
# FNR is 1 on the first record of each input file: count the files.
FNR == 1 { nFiles++ }

# Keep each distinct column-1 key once, in order of first appearance.
!($1 in seenKey) { seenKey[$1] = 1; names[++nNames] = $1 }

# Columns 2 and 3 of this record, stored under (key, file number).
{ pair[$1, nFiles] = $2 FS $3 }

END {
    n = 0
    while (++n <= nNames) {
        out = names[n]
        f = 0
        while (++f <= nFiles)
            out = out "\t" (((names[n], f) in pair) ? pair[names[n], f] : "Mtee\tMtee")
        print out
    }
}
' file1 file2 file3
Names   Title1  Title2  Title3  Title4  Title5  Title6
AAAA    1111    123456  Mtee    Mtee    3333    987654
BBBBB   1111    123456  2222    789456  Mtee    Mtee
CCC     1111    123456  Mtee    Mtee    3333    987654
DDDDD   Mtee    Mtee    2222    789456  Mtee    Mtee
EEEE    Mtee    Mtee    2222    789456  3333    987654

答案 1 :(得分:0)

这是另一个awk脚本:

#!/usr/bin/awk -f
# Merge columns 2 and 3 of several tab-separated files into one table keyed
# on column 1; a key missing from a file gets two "Mtee" placeholders.

# Fields are separated by tabs.
BEGIN { FS = "\t" }

{
    # Drop the carriage return left over from DOS (CRLF) line endings.
    sub(/\r/, "")

    # A file name not seen before means a new input file: advance its number.
    if (!(FILENAME in fileSeen)) {
        fileSeen[FILENAME] = 1
        fileCount++
    }

    # Remember each column-1 key once, in first-seen order (0-based).
    if (!($1 in keySeen)) {
        keySeen[$1] = 1
        order[keyCount++] = $1
    }

    # Fields 2 and 3 of this record, stored under (key, file number).
    cell[$1, fileCount] = $2 FS $3
}

# After all input: one row per key, one column pair per file.
END {
    for (n = 0; n < keyCount; n++) {
        row = order[n]
        for (f = 1; f <= fileCount; f++) {
            stored = cell[order[n], f]
            if (stored == "")
                stored = "Mtee" FS "Mtee"    # key was absent from file f
            row = row "\t" stored
        }
        print row
    }
}

用法:

awk -f script.awk file1 file2 file3

输出:

Names   Title1  Title2  Title3  Title4  Title5  Title6
AAAA    1111    123456  Mtee    Mtee    3333    987654
BBBBB   1111    123456  2222    789456  Mtee    Mtee
CCC     1111    123456  Mtee    Mtee    3333    987654
DDDDD   Mtee    Mtee    2222    789456  Mtee    Mtee
EEEE    Mtee    Mtee    2222    789456  3333    987654