Question

我有两个文件，并尝试根据列比较文件

File_1

CALL_3  CALL_1
CALL_2  CALL_5
CALL_3  CALL_2
CALL_1  CALL_4

File_2

CALL_1   GAP:A  GAP:G
CALL_3   GAP:C  GAP:Q  GAP:R 
CALL_5   GAP:R  GAP:A
CALL_4   GAP:C  GAP:D  GAP:A  GAP:W
CALL_2   GAP:C  GAP:R  GAP:A

我只想打印 File_1 中满足以下条件的交互（行）：该行的两个 CALL 在 File_2 中至少有一个共同的 GAP id；并在该行后面输出这些共同的 GAP id。

预期输出

CALL_2  CALL_5  GAP:A GAP:R
CALL_3  CALL_2  GAP:C GAP:R
CALL_1  CALL_4  GAP:A

我尝试了以下方法：

# Attempt from the question. Pass 1 (File_2): store, per key $1, fields $1..$9 joined by OFS.
# Pass 2 (File_1): when both $1 and $2 are stored keys, print the two stored records.
# The field list is hard-coded up to $9, so any File_2 column beyond the ninth is not stored.
awk 'NR==FNR {
a[$1]=($1 OFS $2 OFS $3 OFS $4 OFS $5 OFS $6 OFS $7 OFS $8 OFS $9)
next 
}
($1 in a)&&($2 in a) {
print a[$1],a[$2]
}' File_2 File_1

对于固定数量的列，它运行良好。但是file_2中的列数不是固定的（超过1000列）。如何获得预期的输出？

Answer 1

请您尝试以下。

# Pass 1 (Input_file2): a[key] keeps the GAP ids of each key as one OFS-joined string.
# Pass 2 (Input_file1): the GAP strings of $1 and $2 are split and turned into two sets
# (array3, array4); the line is printed followed by the ids found in both sets, and lines
# without any common id are skipped. The ids are collected with a for-in loop, so their
# order in the output is whatever order awk traverses array3 in.
awk '
FNR==NR{
  val=$1
  $1=""
  $0=$0
  $1=$1
  a[val]=$0
  next
}
{
  val=""
  num1=split(a[$1],array1," ")
  for(i=1;i<=num1;i++){
    array3[array1[i]]
  }
  num2=split(a[$2],array2," ")
  for(i=1;i<=num2;i++){
    array4[array2[i]]
  }
  for(k in array3){
    if(k in array4){
      val=(val?val OFS:"")k
    }
  }
  if(val){
    print $0,val
  }
  val=""
  delete array1
  delete array2
  delete array3
  delete array4
}
'  Input_file2   Input_file1

输出如下。

CALL_2  CALL_5 GAP:A GAP:R
CALL_3  CALL_2 GAP:C GAP:R
CALL_1  CALL_4 GAP:A

说明：下面为上述代码逐行添加了详细注释。

awk '                                  ##Start of the awk program.
FNR==NR{                               ##True only while the first file (Input_file2) is being read.
  val=$1                               ##Save the key (first field) of the current line in val.
  $1=""                                ##Empty the first field; the record is rebuilt with a leading separator.
  $0=$0                                ##Re-evaluate the record so it is split into fields again, now without the key.
  $1=$1                                ##Rebuild the record from its fields joined by OFS, which drops the leading blank.
  a[val]=$0                            ##Store the remaining GAP ids in array a under index val.
  next                                 ##Skip the remaining rules for lines of the first file.
}
{
  val=""                               ##Reset val, which collects the common GAP ids of this line.
  num1=split(a[$1],array1," ")         ##Split the GAP ids stored for $1 into array1; num1 is their count.
  for(i=1;i<=num1;i++){                ##Loop over the elements of array1.
    array3[array1[i]]                  ##Referencing array3 with the GAP id as index creates that index (a set of ids for $1).
  }
  num2=split(a[$2],array2," ")         ##Split the GAP ids stored for $2 into array2; num2 is their count.
  for(i=1;i<=num2;i++){                ##Loop over the elements of array2.
    array4[array2[i]]                  ##Same as above: array4 becomes the set of ids for $2.
  }
  for(k in array3){                    ##Visit every GAP id of $1 (for-in order is not specified).
    if(k in array4){                   ##Keep the id only when it is also a GAP id of $2.
      val=(val?val OFS:"")k            ##Append k to val, preceded by OFS when val already has content.
    }
  }
  if(val){                             ##Only when at least one common id was collected.
    print $0,val                       ##Print the current line followed by the common ids.
  }
  val=""                               ##Reset val again.
  delete array1                        ##Clear array1 before the next line is processed.
  delete array2                        ##Clear array2 before the next line is processed.
  delete array3                        ##Clear array3 before the next line is processed.
  delete array4                        ##Clear array4 before the next line is processed.
}
'  Input_file2  Input_file1            ##Input files: Input_file2 is read first, then Input_file1.

Answer 2

使用支持“数组的数组”（arrays of arrays）的 GNU awk：

$ cat tst.awk
# Pass 1 (first file): for every line, register each field after the first
# as an index of the subarray gaps[<first field>].
FNR == NR {
    for (col = 2; col <= NF; col++)
        gaps[$1][$col]
    next
}

# Pass 2 (second file): collect the indices present under both gaps[$1] and
# gaps[$2]; print the line with them appended when there is at least one.
{
    shared = ""
    for (id in gaps[$1])
        if (id in gaps[$2])
            shared = shared OFS id
    if (shared == "")
        next
    print $0 shared
}

$ awk -f tst.awk file2 file1
CALL_2  CALL_5 GAP:A GAP:R
CALL_3  CALL_2 GAP:C GAP:R
CALL_1  CALL_4 GAP:A

适用于任何 awk 的版本：

$ cat tst.awk
# Pass 1 (first file): strip the leading key plus the blanks after it and
# remember the rest of the line under that key.
FNR == NR {
    id = $1
    sub(/[^[:space:]]+[[:space:]]+/, "")
    gaps[id] = $0
    next
}

# Pass 2 (second file): build one set per column from the remembered strings,
# then print the line followed by every member found in both sets.
{
    mkSet(gaps[$1], gaps1)
    mkSet(gaps[$2], gaps2)
    shared = ""
    for (g in gaps1)
        if (g in gaps2)
            shared = shared OFS g
    if (shared == "")
        next
    print $0 shared
}
# mkSet(str, arr): empty arr, then give it one index per word of str
# (str is split with the default field separator).
# idx and words are locals, declared as extra parameters.
function mkSet(str, arr,    idx, words) {
    delete arr
    split(str, words)
    for (idx in words)
        arr[words[idx]]
}

$ awk -f tst.awk file2 file1
CALL_2  CALL_5 GAP:A GAP:R
CALL_3  CALL_2 GAP:C GAP:R
CALL_1  CALL_4 GAP:A

Answer 3

我在 bash 中用 coreutils 实现了。单行命令（one-liner）：

# Join file_1 with file_2 on column 1, join that result with file_2 again on column 2, then keep the GAP ids that occur twice.
# Every input is sorted on exactly the field join compares (-k1b,1 / -k2b,2); xargs -L 1 is the POSIX form of the deprecated -l1.
join -12 -21 <(join -11 -21 <(sort -k1b,1 file_1) <(sort -k1b,1 file_2) | sort -k2b,2) <(sort -k1b,1 file_2) | xargs -L 1 bash -c 'a=$(<<<"${@:3}" tr " " "\n" | sort | uniq -d | tr "\n" " "); if [ -n "$a" ]; then printf "%s %s %s\n" "$1" "$2" "$a"; fi' --

或者更多行：

# Join file_1 with file_2 on column 1, join that result with file_2 again on
# column 2, then for each joined line keep the GAP ids that occur twice.
# Every input is sorted on exactly the field join compares (-k1b,1 / -k2b,2),
# which is the order join requires; xargs -L 1 is the POSIX form of -l1.
# In the inner script, $1 and $2 are the two CALL ids and the remaining
# arguments are the GAP ids of both CALLs.
join -12 -21 <(
     join -11 -21 <(sort -k1b,1 file_1) <(sort -k1b,1 file_2) | sort -k2b,2
) <(
     sort -k1b,1 file_2
) |
xargs -L 1 bash -c '
  a=$(<<<"${@:3}" tr " " "\n" | sort | uniq -d | tr "\n" " ");
  if [ -n "$a" ]; then
     printf "%s %s %s\n" "$1" "$2" "$a"
  fi
' --

1. 以第一个字段为键，将 file_1 与 file_2 连接（join）。
2. 将第 1 步的结果以其第二个字段为键，再次与 file_2 连接。
然后针对每一行：
1. 只取出 GAP* 部分中重复出现的项（即两个 CALL 共有的 GAP）
2. 如果存在重复项，则打印这两个 CALL_* 以及这些重复项

结果：

CALL_2 CALL_3 GAP:C GAP:R 
CALL_4 CALL_1 GAP:A 
CALL_5 CALL_2 GAP:A GAP:R

Answer 4

使用awk，这很简单：

# Pass 1 (file2): a[key] = whole line, rebuilt with OFS between fields.
# Pass 2 (file1): split the line stored for $1 into b and append to str every b[i]
# that also occurs as a whole field in the line stored for $2 (the matching id itself,
# not the whole a[$2] line); print str only when something was appended.
$ awk '(NR==FNR){$1=$1;a[$1]=$0;next}
       {str=strt=$1 OFS $2}
       {split(a[$1],b,OFS)}
       {for(i in b) if(index(a[$2] OFS, OFS b[i] OFS)) str=str OFS b[i]}
       (str!=strt){print str}' file2 file1

工作原理：

(NR==FNR){$1=$1;a[$1]=$0;next}

第一行将file2缓冲在关联数组a[key]=value中，其中key是第一元素，value是整行。例如
```
a["CALL_1"]="CALL_1 GAP:A GAP:G"
```
请注意，我们通过 $1=$1 重建记录，从而把所有的 FS 分隔符替换为 OFS。
{str=strt=$1 OFS $2}

这只是把 file1 当前行的两个字段（例如 CALL_1 CALL_2）用 OFS 连接后存入变量 str 和 strt。
{split(a[$1],b,OFS)} ：将 $1 对应的缓冲行拆分到数组 b 中。
{for(i in b) if(index(a[$2] OFS, OFS b[i] OFS)) str=str OFS b[i]}

对于数组 b 中的每个条目，检查字符串 OFS b[i] OFS 是否出现在字符串 a[$2] OFS 中；若出现，则把该条目 b[i] 追加到 str。两端额外添加的 OFS 用来保证按整个字段匹配。b 中还包含键本身，因此也会测试 OFS CALL_2 OFS 之类的值，但它永远不会匹配。这是一点很小的开销，而专门排除它反而会产生更多的开销。

更优化的版本如下：

# Pass 1 (file2): a[k] = the line with $1 emptied (so it starts with OFS), c[k] = number of GAP ids.
# It must store $0 here: $1 has just been emptied, so storing $1 would leave every a[k] empty.
# Pass 2 (file1): split the shorter of the two GAP lists into b, search in the longer one (s),
# and append each matching id b[i] to str; print str only when something was appended.
$ awk '(NR==FNR){k=$1;$1="";a[k]=$0;c[k]=NF-1;next}
       {str=strt=$1 OFS $2}
       (c[$1]< c[$2]) {split(substr(a[$1],2),b,OFS);s=a[$2] OFS}
       (c[$1]>=c[$2]) {split(substr(a[$2],2),b,OFS);s=a[$1] OFS}
       {for(i in b) if(index(s, OFS b[i] OFS)) str=str OFS b[i]}
       (str!=strt){print str}' file2 file1

比较两个文件的列

4 个答案: