我有2个TSV文件:
TSV file 1:
A B
hello 0.5
bye 0.4
TSV file 2:
C D
hello 1
country 5
我想根据 file1.A = file2.C 把这两个文件连接(join)起来。
我该如何使用 Linux 中的 join 命令来完成?
希望得到这个:
Text B D
hello 0.5 1
bye 0.4
country 5
我尝试了下面的命令,但没有得到任何输出:
join -j 1 <(sort -k1 file1.tsv) <(sort -k1 file2.tsv)
答案 0 :(得分:1)
写法有点繁琐,但下面是一个使用 awk 和关联数组的解决方案。
# Full outer join of test1.tsv and test2.tsv on their first column.
# Output: a header line "Text<TAB><col2 name of file 1><TAB><col2 name of file 2>",
# followed by one sorted row per key; a side without a match is left empty.
awk '
  # The first line of each file is its header: remember the name of column 2.
  FNR == 1 { h[++nh] = $2; next }
  # Data rows: key -> value, one map per input file.
  FILENAME == ARGV[1] { t1[$1] = $2; next }
  { t2[$1] = $2 }
  END {
    print "Text\t" h[1] "\t" h[2]
    # Keys from file 1, matched or not. The "in" test avoids creating
    # empty t2 entries as a side effect of the lookup.
    for (k in t1) print k "\t" t1[k] "\t" (k in t2 ? t2[k] : "")
    # Keys that occur only in file 2.
    for (k in t2) if (!(k in t1)) print k "\t\t" t2[k]
  }
' test1.tsv test2.tsv |
{
  # Pass the header through untouched and sort only the data rows, so that
  # locale-dependent collation can never move the header below the data.
  IFS= read -r header && printf '%s\n' "$header"
  sort
}
答案 1 :(得分:1)
**File1**
$ cat file1
A B
hello 0.5
bye 0.4
**File2**
$ cat file2
C D
hello 1
country 5
**输出**
# Full outer join of file1 and file2 on column 1 (file2 is read first into array A).
$ awk 'NR==1{print "Text","B","D"}FNR==1{next}FNR==NR{A[$1]=$2;next}{f=($1 in A);print $0,(f ? A[$1] : "");if(f)delete A[$1]}END{for(i in A)print i,"",A[i]}' OFS='\t' file2 file1
Text B D
hello 0.5 1
bye 0.4
country 5
可读性更好的版本:
awk '
  # Full outer join of file1 and file2 on column 1.
  # file2 is listed first on the command line, so it is the file loaded
  # into array A; file1 is then streamed against it.

  # Very first record overall (the header line of file2): emit the output header.
  NR==1 { print "Text", "B", "D" }

  # NR counts records across all input files; FNR restarts at 1 for each file.
  # FNR==1 is therefore the header line of each input file: skip it.
  FNR==1 { next }

  # FNR==NR holds only while the first file (file2) is being read.
  FNR==NR {
    # Associative array: key = column 1, value = column 2.
    A[$1] = $2
    # Skip the remaining rules and read the next line.
    next
  }

  # Every remaining line comes from the second file (file1).
  {
    # f is 1 when the key ($1) was also present in file2, otherwise 0.
    # Membership is tested with "in" rather than by looking at the stored
    # value, so a value of 0 or an empty value still counts as a match.
    f = ($1 in A)
    # Print the file1 line followed by the matching file2 value (or empty).
    print $0, (f ? A[$1] : "")
    # Drop matched keys so that END only sees keys unique to file2.
    if (f) delete A[$1]
  }

  # Finally print the file2 rows whose key never appeared in file1;
  # the column belonging to file1 is left empty.
  END {
    for (i in A)
      print i, "", A[i]
  }
' OFS='\t' file2 file1