inFile中的第一列包含一个不一定存在于所有inFiles中的字符串
每个inFile中的第2和第7列包含Title#strings
使用AWK,我无法正确拼凑这些内容。我对描述性变量的使用有望帮助澄清我正在尝试做的事情。这些是我认为我需要的组件:
-F'\t'
!1stColumnNames[$1]++ { name[++i] = $1 }
!r[FILENAME]++ { ++argind }
{ 2ndColumnVals[$1, argind] = $2 } { 7thColumnVals[$1, argind] = $7 }
?????
?????
*.tsv > outFile.tsv
Names Title1 Title2
AAAA 1111 123456
BBBBB 1111 123456
CCC 1111 123456
Names Title3 Title4
BBBBB 2222 789456
DDDDD 2222 789456
EEEE 2222 789456
Names Title5 Title6
AAAA 3333 987654
CCC 3333 987654
EEEE 3333 987654
Names Title1 Title2 Title3 Title4 Title5 Title6
AAAA 1111 123456 Mtee Mtee 3333 987654
BBBBB 1111 123456 2222 789456 Mtee Mtee
CCC 1111 123456 Mtee Mtee 3333 987654
DDDDD Mtee Mtee 2222 789456 Mtee Mtee
EEEE Mtee Mtee 2222 789456 3333 987654
GNU Awk 4.0.1位于/usr/bin/awk中,所以我创建了这个文件并在3个输入文件所在的同一个工作目录中执行它:
#!/usr/bin/env bash
#
# script1.sh - merge tab-separated files into one table keyed on column 1.
#
# Example Usage: script1.sh inFile1.tsv inFile2.tsv inFile3.tsv > outFile123.tsv
#
# Output: one row per distinct column-1 value, in order of first appearance,
# followed by columns 2 and 3 taken from each input file in argument order.
# A file that does not contain a given key contributes the placeholder
# pair "Mtee<TAB>Mtee" instead.
set -eu

awk -F'\t' '
  # FNR restarts at 1 for each input file, so this counts the non-empty
  # files read so far.
  FNR == 1 { ++numFiles }

  # Remember each key once, preserving the order of first appearance.
  !seen[$1]++ { keys[++numKeys] = $1 }

  # Store columns 2 and 3 of the current file under (key, file number).
  { a[$1, numFiles] = $2 FS $3 }

  END {
    for (keyNr = 1; keyNr <= numKeys; keyNr++) {
      key = keys[keyNr]
      printf "%s", key
      for (fileNr = 1; fileNr <= numFiles; fileNr++) {
        # The membership test does not create entries for missing pairs.
        printf "\t%s", ((key, fileNr) in a ? a[key, fileNr] : "Mtee\tMtee")
      }
      print ""
    }
  }
' "$@"
正在运行awk -F script1.awk inFile1.tsv inFile2.tsv inFile3.tsv > outFile123.tsv
会显示以下错误消息:
awk: cmd. line:1: inFile1.tsv
awk: cmd. line:1: ^ syntax error
#!/usr/bin/awk -f
#### named as script2.awk
#### Example Usage: awk -f script2.awk inFile1.tsv inFile2.tsv inFile3.tsv > outFile123.tsv
#
# Merges tab-separated files on column 1. Each output row is a key followed
# by columns 2 and 3 from every input file, in argument order; a file that
# lacks the key contributes "Mtee<TAB>Mtee".

# Input fields are tab-separated.
BEGIN { FS = "\t" }

# Strip the trailing carriage return left by DOS (CRLF) line endings.
# Anchoring with $ ensures only the line terminator is removed.
{ sub(/\r$/, "") }

# Count input files: indx is incremented the first time a FILENAME is seen.
!f[FILENAME]++ { ++indx }

# Remember each column-1 key once, in order of first appearance (0-based).
!a[$1]++ { keys[i++] = $1 }

# Store columns 2 and 3, re-joined with a tab, under (key, file index).
{ b[$1, indx] = $2 FS $3 }

END {
    # keys is filled contiguously from 0, so "i in keys" becomes false
    # right after the last stored key and ends the loop.
    for (i = 0; i in keys; ++i) {
        key = keys[i]
        # Print the current key (a scalar); passing the keys array itself
        # to printf is an error.
        printf "%s", key
        # One tab-separated pair per input file, in file order.
        for (j = 1; j <= indx; ++j) {
            # Testing membership avoids creating empty b entries.
            if ((key, j) in b)
                printf "\t%s", b[key, j]
            else
                printf "\t%s", "Mtee" FS "Mtee"
        }
        # Terminate the row.
        print ""
    }
}
答案 0 :(得分:3)
你需要这样的东西:
Gawk版本(使用ARGIND以及gawk 4.0+中真正的二维数组):
$ gawk -F'\t' '
    # Record each distinct column-1 value once, in first-seen order.
    !($1 in known) { known[$1]; order[++total] = $1 }

    # True 2D array (gawk 4.0+): cell[key][position of the file in ARGV].
    { cell[$1][ARGIND] = $2 FS $3 }

    END {
        # In END, ARGIND still holds the index of the last file processed.
        for (k = 1; k <= total; k++) {
            name = order[k]
            row = name
            for (f = 1; f <= ARGIND; f++) {
                if (f in cell[name])
                    row = row "\t" cell[name][f]
                else
                    row = row "\tMtee\tMtee"
            }
            print row
        }
    }
' file1 file2 file3
非gawk版本:
awk -F'\t' '
    # Each non-empty input file bumps the file counter on its first record.
    FNR == 1 { nFiles++ }

    # First sighting of a column-1 value: append it to the ordered key list.
    !($1 in known) { known[$1]; order[++nKeys] = $1 }

    # Columns 2 and 3, tab-joined, indexed by (key, file number).
    { cell[$1, nFiles] = $2 FS $3 }

    END {
        # Assemble each output row as a string, then emit it in one print.
        for (k = 1; k <= nKeys; k++) {
            name = order[k]
            row = name
            for (f = 1; f <= nFiles; f++) {
                if ((name, f) in cell)
                    row = row "\t" cell[name, f]
                else
                    row = row "\tMtee\tMtee"
            }
            print row
        }
    }
' file1 file2 file3
Names Title1 Title2 Title3 Title4 Title5 Title6
AAAA 1111 123456 Mtee Mtee 3333 987654
BBBBB 1111 123456 2222 789456 Mtee Mtee
CCC 1111 123456 Mtee Mtee 3333 987654
DDDDD Mtee Mtee 2222 789456 Mtee Mtee
EEEE Mtee Mtee 2222 789456 3333 987654
答案 1 :(得分:0)
这是另一个awk脚本:
#!/usr/bin/awk -f
# Merge tab-separated files on their first column: each output row is a key
# followed by fields 2 and 3 from every input file, in argument order.

# Set field separator to tab (\t)
BEGIN { FS = "\t" }

# Remove the trailing carriage return if the file is in DOS format (CRLF).
# The pattern is anchored so that only the line terminator is stripped.
{ sub(/\r$/, "") }

# Increment indx by 1 (starts at 0) the first time each FILENAME is seen
!f[FILENAME]++ { ++indx }

# Add a key ($1) to keys array every time it is first encountered
!a[$1]++ { keys[i++] = $1 }

# Store the 2nd and 3rd field, joined by a tab, under (key, file index)
{ b[$1, indx] = $2 FS $3 }

# This block runs after all files are processed
END {
    # Traverse the keys in insertion order; keys is filled contiguously
    # from 0, so the loop stops at the first missing index.
    for (i = 0; i in keys; ++i) {
        key = keys[i]

        # Print key
        printf "%s", key

        # Print columns from every file in order
        for (j = 1; j <= indx; ++j) {
            # A membership test does not create an empty b entry the way
            # reading b[key, j] would.
            if ((key, j) in b)
                printf "\t%s", b[key, j]
            else
                printf "\t%s", "Mtee" FS "Mtee"
        }

        # End the line with a newline
        print ""
    }
}
用法:
awk -f script.awk file1 file2 file3
输出:
Names Title1 Title2 Title3 Title4 Title5 Title6
AAAA 1111 123456 Mtee Mtee 3333 987654
BBBBB 1111 123456 2222 789456 Mtee Mtee
CCC 1111 123456 Mtee Mtee 3333 987654
DDDDD Mtee Mtee 2222 789456 Mtee Mtee
EEEE Mtee Mtee 2222 789456 3333 987654