我有两个以制表符分隔的文件需要合并。这两个文件的键可能只在 "Reads" 后面的数字上有所不同。
我想比较这两个文件,并根据键的子字符串匹配来合并它们。
例如,
File1 Key : "Cluster0_Reads255"
File2 Key : "Cluster0_Reads50"
This case is same because "Cluster0_Reads" is identical.
在这种情况下,我想使用 File1 的键名把两个文件的列合并起来。请参见下面的示例。
文件1。
A B
Cluster0_Reads255 500
Cluster1_Reads253 300
Cluster2_Reads100 200
Cluster3_Reads100 350
文件2。
A C
Cluster0_Reads50 GE
Cluster1_Reads200 GA
Cluster2_Reads100 GA
结果。
A B C
Cluster0_Reads255 500 GE
Cluster1_Reads253 300 GA
Cluster2_Reads100 200 GA
Cluster3_Reads100 350 -
我找到了一个按完全匹配进行合并的 awk 脚本,如下所示:
# Exact-match join of two tab-separated files on column 1.
# Every File1 row is written to File3, extended with columns 2..NF of the
# File2 row that has the same first column, or with "-" when there is none.
awk '
BEGIN { FS = OFS = "\t" }
{ key = $1 }
# Rows of the first file operand. FILENAME is compared with ARGV[1] instead of
# testing FNR == NR, because FNR == NR is also true for File2 rows when File1
# has no records.
FILENAME == ARGV[1] { result[key] = $0; next }
# File2 rows whose key was seen in File1: append their columns 2..NF.
(key in result) {
    updated[key] = 1
    for (i = 2; i <= NF; i++)
        result[key] = result[key] FS $i
}
END {
    PROCINFO["sorted_in"] = "@ind_str_asc"   # key order; only GNU awk honours this
    for (key in result) {
        if (!(key in updated))
            result[key] = result[key] FS "-"
        if (length(key) != 0)                # skip rows whose first column is empty
            print result[key]
    }
}
' File1 File2 > File3
有没有办法按键的子字符串匹配来进行合并?
谢谢。
答案 0 :(得分:2)
The awk script below is a bit dirty, but it does the job;
I'm sure you will find a better one here.
# Join File1 with File2 on column 1 after removing the trailing digits of the key.
# Each File1 row is printed with column 2 of the matching File2 row, or "-" when
# there is no match. for-in visits keys in unspecified order, so the rows are
# padded with expand and sorted afterwards.
awk '
BEGIN { FS = OFS = "\t" }
# File1 (first operand): keep the full row under its digit-stripped key.
# FILENAME == ARGV[1] is used instead of NR == FNR, which is also true for
# File2 rows when File1 has no records.
FILENAME == ARGV[1] {
    stem = $1
    sub(/[0-9]*$/, "", stem)
    file1info[stem] = $0
    next
}
# File2: remember column 2 under the same digit-stripped key.
{
    stem = $1
    sub(/[0-9]*$/, "", stem)
    file2info[stem] = $2
}
END {
    for (stem in file1info)
        print file1info[stem], ((stem in file2info) ? file2info[stem] : "-")
}' File1 File2 | expand -t 20 | sort -nk1
Output
A B C
Cluster0_Reads255 500 GE
Cluster1_Reads253 300 GA
Cluster2_Reads100 200 GA
Cluster3_Reads100 350 -
Edit
Finally managed to get a smaller faster one. The trick was reversing the files considering that file2's clusters always form a subset of file1's.
# Load column 2 of File2 into a lookup table keyed by column 1 with its trailing
# digits removed, then print every File1 row followed by the looked-up value,
# or "-" when the digit-stripped key is not in the table.
awk '
BEGIN { FS = OFS = "\t" }
# File2 (first operand). FILENAME == ARGV[1] is used instead of NR == FNR:
# with an empty File2, NR == FNR would also be true for File1 rows, which
# would then be stored instead of printed.
FILENAME == ARGV[1] {
    stem = $1
    sub(/[0-9]*$/, "", stem)
    file2info[stem] = $2
    next
}
# File1: the row is printed unchanged, plus the extra column.
{
    stem = $1
    sub(/[0-9]*$/, "", stem)
    print $0, ((stem in file2info) ? file2info[stem] : "-")
}
' File2 File1 | expand -t 20 | sort -nk1
Output
A B C
Cluster0_Reads255 500 GE
Cluster1_Reads253 300 GA
Cluster2_Reads100 200 GA
Cluster3_Reads100 350 -
答案 1 :(得分:1)
在Gnu AWK:
$ cat > do.awk
# Joins two files on column 1 with trailing digits stripped (gensub is a GNU awk extension).
# NOTE(review): FNR==NR is also true for the second file when the first file has no records.
FNR==NR { # records of the first file operand
a[gensub(/[0-9]+$/,"","g",$1)]=$0 # store the whole line under $1 with the trailing digits removed
next
}
(i=gensub(/[0-9]+$/,"","g",$1)) && (i in a) { # later file: stripped key is non-empty and was stored
sub(/\t/,OFS $2 OFS,a[i]) # replace the first tab of the stored line with OFS $2 OFS, so this record's $2 lands right after the stored key
$0=a[i] # the output row is the stored line, so it carries the first file's key
} 1 # print every record of the later file; unmatched ones are printed unchanged
$ awk -v OFS="\t" -f do.awk file2 file1
A B C
Cluster0_Reads50 500 GE
Cluster1_Reads200 300 GA
Cluster2_Reads100 200 GA
Cluster3_Reads100 350
答案 2 :(得分:1)
$ cat tst.awk
# Join on column 1 with its trailing digits removed. The first file operand
# supplies the extra column; every row of the later operand is printed followed
# by that column, or by "-" when no stored key matches.
BEGIN { FS = OFS = "\t" }
# Digit-stripped key of the current record (computed for both files).
{ key = $1; sub(/[0-9]+$/, "", key) }
# First file operand. FILENAME == ARGV[1] is used instead of NR == FNR, which
# is also true for the second file when the first one has no records; in that
# case every row would be stored and nothing printed.
FILENAME == ARGV[1] { map[key] = $2; next }
{ print $0, (key in map ? map[key] : "-") }
$ awk -f tst.awk file2 file1
A B C
Cluster0_Reads255 500 GE
Cluster1_Reads253 300 GA
Cluster2_Reads100 200 GA
Cluster3_Reads100 350 -
$ awk -f tst.awk file2 file1 | column -s$'\t' -t
A B C
Cluster0_Reads255 500 GE
Cluster1_Reads253 300 GA
Cluster2_Reads100 200 GA
Cluster3_Reads100 350 -