我有两个文件,并尝试根据列比较文件
File_1
CALL_3 CALL_1
CALL_2 CALL_5
CALL_3 CALL_2
CALL_1 CALL_4
File_2
CALL_1 GAP:A GAP:G
CALL_3 GAP:C GAP:Q GAP:R
CALL_5 GAP:R GAP:A
CALL_4 GAP:C GAP:D GAP:A GAP:W
CALL_2 GAP:C GAP:R GAP:A
我只想打印 file_1 中那些两个成员(两列)之间至少共有一个 GAP id 的交互,并在行尾附上它们共有的 GAP id。
预期输出
CALL_2 CALL_5 GAP:A GAP:R
CALL_3 CALL_2 GAP:C GAP:R
CALL_1 CALL_4 GAP:A
我尝试了以下方法:
# Asker's attempt: while reading File_2, cache fields 1-9 of each line (joined
# by OFS) under the line's first column; then, for every File_1 line whose two
# columns are both cached keys, print the two cached values side by side.
awk 'NR==FNR {
a[$1]=($1 OFS $2 OFS $3 OFS $4 OFS $5 OFS $6 OFS $7 OFS $8 OFS $9)
next
}
($1 in a)&&($2 in a) {
print a[$1],a[$2]
}' File_2 File_1
对于固定数量的列,它运行良好。但是file_2中的列数不是固定的(超过1000列)。如何获得预期的输出?
答案 0 :(得分:2)
请您尝试以下。
awk '
# First file (Input_file2): store each line without its key column, indexed by that key.
FNR==NR{
val=$1
$1=""
$0=$0   # re-split the rebuilt record, so the former $2 becomes $1
$1=$1   # rebuild $0 from the fields joined by OFS, which drops the leading blank
a[val]=$0
next
}
# Second file (Input_file1): print the line followed by every id stored for both $1 and $2.
{
val=""
num1=split(a[$1],array1," ")
for(i=1;i<=num1;i++){
array3[array1[i]]   # a bare reference creates the index: array3 is the set of ids stored for $1
}
num2=split(a[$2],array2," ")
for(i=1;i<=num2;i++){
array4[array2[i]]   # same for $2
}
for(k in array3){
if(k in array4){
val=(val?val OFS:"")k   # append k, preceded by OFS unless val is still empty
}
}
if(val){
print $0,val
}
# Reset all per-line state before the next record.
val=""
delete array1
delete array2
delete array3
delete array4
}
' Input_file2 Input_file1
输出如下。
CALL_2 CALL_5 GAP:A GAP:R
CALL_3 CALL_2 GAP:C GAP:R
CALL_1 CALL_4 GAP:A
说明:下面为上述代码逐行添加了详细注释。
awk ' ##Starting awk program here.
FNR==NR{ ##True only while the first file on the command line (Input_file2) is being read.
val=$1 ##Save the first field of the current line in val.
$1="" ##Blank the first field; awk rebuilds $0, which now starts with an OFS.
$0=$0 ##Re-split the rebuilt line into fields, so the former second field becomes $1.
$1=$1 ##Assign a field to itself so $0 is rebuilt once more from the fields, this time without the leading blank.
a[val]=$0 ##Store the remaining line in array a under index val.
next ##Skip the rest of the program for this line.
}
{
val="" ##Reset val before collecting matches for this line.
num1=split(a[$1],array1," ") ##Split the value stored for $1 into array1; num1 is the number of pieces.
for(i=1;i<=num1;i++){ ##Loop over every piece of array1.
array3[array1[i]] ##Reference array3 at index array1[i], which creates that index (array3 works as a set).
}
num2=split(a[$2],array2," ") ##Split the value stored for $2 into array2; num2 is the number of pieces.
for(i=1;i<=num2;i++){ ##Loop over every piece of array2.
array4[array2[i]] ##Reference array4 at index array2[i], which creates that index (array4 works as a set).
}
for(k in array3){ ##Visit every index k of array3.
if(k in array4){ ##Continue only when k is also an index of array4.
val=(val?val OFS:"")k ##Append k to val, putting OFS in front of it unless val is still empty.
}
}
if(val){ ##Only when at least one shared index was collected (val is non-empty).
print $0,val ##Print the current line followed by the collected indexes.
}
val="" ##Reset val.
delete array1 ##Empty array1 for the next line.
delete array2 ##Empty array2 for the next line.
delete array3 ##Empty array3 for the next line.
delete array4 ##Empty array4 for the next line.
}
' Input_file2 Input_file1 ##Mentioning Input_file names here.
答案 1 :(得分:2)
使用 GNU awk(需要它对"数组的数组"的支持):
$ cat tst.awk
# Requires GNU awk (arrays of arrays).
# Pass 1 (first file): idsByKey[key] is the set of ids listed after the key.
NR == FNR {
    col = 1
    while (++col <= NF)
        idsByKey[$1][$col]
    next
}

# Pass 2 (second file): append every id found under both keys of the line
# and print the line only when at least one id was appended.
{
    shared = ""
    for (id in idsByKey[$1])
        if (id in idsByKey[$2])
            shared = shared OFS id
    if (shared == "")
        next
    print $0 shared
}
$ awk -f tst.awk file2 file1
CALL_2 CALL_5 GAP:A GAP:R
CALL_3 CALL_2 GAP:C GAP:R
CALL_1 CALL_4 GAP:A
任何awk:
$ cat tst.awk
# Pass 1 (first file): gaps[key] holds the ids that follow the key, as one string.
NR==FNR {
    key = $1
    # Strip the key together with the blanks around it. Anchoring at ^ and
    # making the trailing blanks optional means a line that holds only a key
    # stores "" instead of leaving the key itself behind as a bogus id.
    sub(/^[[:space:]]*[^[:space:]]+[[:space:]]*/,"")
    gaps[key] = $0
    next
}
# Pass 2 (second file): print the line plus every id listed under both of its keys.
{
    mkSet(gaps[$1],gaps1)
    mkSet(gaps[$2],gaps2)
    common = ""
    for (gap in gaps1) {
        if (gap in gaps2) {
            common = common OFS gap
        }
    }
    if ( common != "" ) {
        print $0 common
    }
}
# mkSet(str, arr): replace the contents of arr with one index per
# whitespace-separated word of str. i and tmp are function-local.
function mkSet(str,arr,    i,tmp) {
    delete arr
    split(str,tmp)
    for (i in tmp) {
        arr[tmp[i]]
    }
}
$ awk -f tst.awk file2 file1
CALL_2 CALL_5 GAP:A GAP:R
CALL_3 CALL_2 GAP:C GAP:R
CALL_1 CALL_4 GAP:A
答案 2 :(得分:1)
我用 coreutils 在 bash 中实现了。单行命令(one-liner):
# For each file_1 pair, join the file_2 row of column 1, then the file_2 row of
# column 2, and keep the ids that occur more than once in the combined list
# (uniq -d). join prints the join field first, so each output line starts with
# file_1 column 2, followed by file_1 column 1.
# Every sort key is limited to the field join compares (-k1,1 / -k2,2), and
# -L 1 is the POSIX spelling of the deprecated xargs -l1.
join -12 -21 <(join -11 -21 <(sort -k1,1 file_1) <(sort -k1,1 file_2) | sort -k2,2) <(sort -k1,1 file_2) | xargs -L 1 bash -c 'a=$(<<<"${@:3}" tr " " "\n" | sort | uniq -d | tr "\n" " "); if [ -n "$a" ]; then printf "%s %s %s\n" "$1" "$2" "$a"; fi' --
或者更多行:
# Inner join: file_1 pairs extended with the file_2 row of their first column,
# re-sorted on the second column. Outer join: adds the file_2 row of that
# second column. join prints the join field first, so each line handed to xargs
# starts with file_1 column 2, then file_1 column 1, then the ids of both rows.
# Every sort key is limited to the field join compares (-k1,1 / -k2,2), and
# -L 1 is the POSIX spelling of the deprecated xargs -l1.
join -12 -21 <(
  join -11 -21 <(sort -k1,1 file_1) <(sort -k1,1 file_2) | sort -k2,2
) <(
  sort -k1,1 file_2
) |
xargs -L 1 bash -c '
  # $1 and $2 are the two keys; the remaining arguments are the ids of both rows.
  # uniq -d keeps the ids that occur more than once in that combined list.
  a=$(<<<"${@:3}" tr " " "\n" | sort | uniq -d | tr "\n" " ");
  if [ -n "$a" ]; then
    printf "%s %s %s\n" "$1" "$2" "$a"
  fi
' --
结果:
CALL_2 CALL_3 GAP:C GAP:R
CALL_4 CALL_1 GAP:A
CALL_5 CALL_2 GAP:A GAP:R
答案 3 :(得分:1)
使用awk,这很简单:
$ awk '
    # file2: normalise the separators to OFS and cache the whole line under its key
    (NR==FNR){$1=$1;a[$1]=$0;next}
    # file1: str grows when ids match; strt keeps the bare pair for the final comparison
    {str=strt=$1 OFS $2}
    {split(a[$1],b,OFS)}
    # append each word b[i] of the $1 line that also occurs as a whole field in the $2 line
    {for(i in b) if(index(a[$2] OFS, OFS b[i] OFS)) str=str OFS b[i]}
    (str!=strt){print str}' file2 file1
工作原理:
(NR==FNR){$1=$1;a[$1]=$0;next}
第一行将 file2 缓冲在关联数组 a[key]=value 中,其中 key 是行的第一个字段,value 是整行。例如:
a["CALL_1"]="CALL_1 GAP:A GAP:G"
请注意,我们通过 $1=$1 让 awk 重建该行,从而把所有的 FS 替换成了 OFS。
{str=strt=$1 OFS $2}
这只是把当前行的前两个字段(例如 CALL_1 CALL_2)同时存入变量 str 和 strt;strt 保留原始值,最后用来判断 str 是否被追加过内容。
{split(a[$1],b,OFS)}
:将缓冲的行拆分为数组b
# appends the matching word b[i] itself to str
{for(i in b) if(index(a[$2] OFS, OFS b[i] OFS)) str=str OFS b[i]}
对于数组 b 中的每个条目,检查字符串 OFS b[i] OFS 是否出现在字符串 a[$2] OFS 中。前后额外添加的 OFS 用于确保按完整字段匹配,而不是匹配到某个字段的一部分。我们也会测试 OFS CALL_2 OFS 之类的值(即行首的键),但它永远不会匹配。这只是很小的开销,而专门排除它反而会带来更多开销。
更优化的版本将显示为:
$ awk '
    # file2: a[k] is the line with its key blanked (so it starts with OFS),
    # c[k] is the number of ids on that line
    (NR==FNR){k=$1;$1="";a[k]=$0;c[k]=NF-1;next}
    {str=strt=$1 OFS $2}
    # split the shorter id list into b and search the longer one, s;
    # substr(...,2) skips the leading OFS (a single character by default)
    (c[$1]< c[$2]) {split(substr(a[$1],2),b,OFS);s=a[$2] OFS}
    (c[$1]>=c[$2]) {split(substr(a[$2],2),b,OFS);s=a[$1] OFS}
    # append each id b[i] that also occurs as a whole field in s
    {for(i in b) if(index(s, OFS b[i] OFS)) str=str OFS b[i]}
    (str!=strt){print str}' file2 file1