awk比较三个文件并合并输出

时间:2014-08-01 07:10:13

标签: awk

我有 3 个文件:1.csv、2.csv 和 3.csv。

1.csv

TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Alphanumeric_A_MSISDN_blocking,1  
CABLE&WIRELESS_BARBADOS,BARBADOS,Alphanumeric_A_MSISDN_blocking,791  
SIMINN_ICELAND_TELECOM,ICELAND,Alphanumeric_A_MSISDN_blocking,109373  
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,Alphanumeric_A_MSISDN_blocking,2  
CABLE&WIRELESS_JAMAICA,JAMAICA,Alphanumeric_A_MSISDN_blocking,85  

2.csv

SIMINN_ICELAND_TELECOM,ICELAND,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795  
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638

3.csv:

TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Calling_Party_Address_Blocking,79  
CABLE&WIRELESS_BARBADOS,BARBADOS,Calling_Party_Address_Blocking,30  
MOBILKOM_LIECHTENSTEIN,LIECHTENSTEIN,Calling_Party_Address_Blocking,6
SYNIVERSE_ANSI,UNITED_STATES,Calling_Party_Address_Blocking,12

我想合并文件,以便打印输出文件,如下所示

TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Alphanumeric_A_MSISDN_blocking,1,NA,NA,Calling_Party_Address_Blocking,79
CABLE&WIRELESS_BARBADOS,BARBADOS,Alphanumeric_A_MSISDN_blocking,791,NA,NA,Calling_Party_Address_Blocking,30
SIMINN_ICELAND_TELECOM,ICELAND,Alphanumeric_A_MSISDN_blocking,109373,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795,NA,NA
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,Alphanumeric_A_MSISDN_blocking,2,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638,NA,NA
CABLE&WIRELESS_JAMAICA,JAMAICA,Alphanumeric_A_MSISDN_blocking,85,NA,NA,NA,NA 
MOBILKOM_LIECHTENSTEIN,LIECHTENSTEIN,NA,NA,NA,NA,Calling_Party_Address_Blocking,6
SYNIVERSE_ANSI,UNITED_STATES,NA,NA,NA,NA,Calling_Party_Address_Blocking,12

我的脚本:

# Two-way lookup join applied to three files: 1.csv is loaded into memory,
# then every record of 2.csv and 3.csv is printed followed by the matching
# 1.csv payload (or NA,NA).  1.csv keys that were never matched are printed
# at the end.  Each output line therefore carries at most two name,value
# pairs, never one pair per input file.
awk '
BEGIN { FS = OFS = "," }

# First file only (NR and FNR are equal only while reading it):
# remember columns 3-4 under the "col1,col2" key, print nothing.
NR == FNR {
    pending[$1 FS $2] = $3 FS $4
    next
}

# Every later file: echo the record and append what the first file
# held for the same key, then forget that key.
{
    id = $1 FS $2
    if (id in pending)
        extra = pending[id]
    else
        extra = "NA,NA"
    print $0, extra
    delete pending[id]
}

# Keys of the first file that no later record consumed.
END {
    for (id in pending)
        print id, "NA,NA", pending[id]
}' 1.csv 2.csv 3.csv

输出:

SIMINN_ICELAND_TELECOM,ICELAND,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795,Alphanumeric_A_MSISDN_blocking,109373  
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638,Alphanumeric_A_MSISDN_blocking,2  
TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Calling_Party_Address_Blocking,79,Alphanumeric_A_MSISDN_blocking,1  
CABLE&WIRELESS_BARBADOS,BARBADOS,Calling_Party_Address_Blocking,30,Alphanumeric_A_MSISDN_blocking,791  
MOBILKOM_LIECHTENSTEIN,LIECHTENSTEIN,Calling_Party_Address_Blocking,6,NA,NA  
SYNIVERSE_ANSI,UNITED_STATES,Calling_Party_Address_Blocking,12,NA,NA  
CABLE&WIRELESS_JAMAICA,JAMAICA,NA,NA,Alphanumeric_A_MSISDN_blocking,85  

如您所见,我的输出格式不符合预期,请提供解决方法。

2 个答案:

答案 0 :(得分:1)

以下是如何做到的:

# merge.awk -- full outer join of several CSV files on their first two columns.
#
# Usage: awk -f merge.awk 1.csv 2.csv 3.csv
#
# Each input record is read as  key1,key2,name,value  (fields 1-4).
# One line is printed per distinct (key1,key2) pair: the two key fields,
# then fields 3-4 from each input file in command-line order.  A file that
# has no record for the key contributes "NA,NA".  If a key occurs more than
# once in the same file, the last occurrence wins.  Lines are printed in
# awk's unspecified for-in traversal order.
BEGIN { FS = "," }

# FNR restarts at 1 on the first record of every input file, so `file` is
# the ordinal of the file being read (a file with no records is not counted).
FNR==1 { ++file }

{
    # Strip trailing blanks and a DOS carriage return; otherwise they would
    # be carried inside field 4 and end up in the middle of a merged line.
    sub(/[ \t\r]+$/, "")

    a[$1,$2,file] = $3 FS $4    # payload of this key in this file
    ++seen[$1,$2]               # every key met in any file
}

END {
    for (j in seen) {
        # j is "key1 SUBSEP key2"; recover the two key fields for printing.
        split(j, b, SUBSEP)
        s = b[1] FS b[2]
        for (i=1; i<=file; ++i) {
            s = s FS ((j SUBSEP i) in a ? a[j,i] : "NA" FS "NA")
        }
        print s
    }
}

之前看起来 1.csv 是主记录,而 2.csv 和 3.csv 只包含已有记录的附加数据。

每次新文件开始时,递增file计数器。使用前两列并将文件计数器作为键,将找到的每条记录添加到数组a。将前两列添加到seen数组中,以便知道完整的键列表。

在 END 块中,遍历 seen 数组,把每个键在各文件中的记录合并在一起;如果某个文件中缺少该键的记录,则用 "NA" 填充。

测试出来:

$ awk -f merge.awk 1.csv 2.csv 3.csv 
SYNIVERSE_ANSI,UNITED_STATES,NA,NA,NA,NA,Calling_Party_Address_Blocking,12
TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Alphanumeric_A_MSISDN_blocking,1,NA,NA,Calling_Party_Address_Blocking,79
MOBILKOM_LIECHTENSTEIN,LIECHTENSTEIN,NA,NA,NA,NA,Calling_Party_Address_Blocking,6
CABLE&WIRELESS_JAMAICA,JAMAICA,Alphanumeric_A_MSISDN_blocking,85,NA,NA,NA,NA
SIMINN_ICELAND_TELECOM,ICELAND,Alphanumeric_A_MSISDN_blocking,109373,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795,NA,NA
CABLE&WIRELESS_BARBADOS,BARBADOS,Alphanumeric_A_MSISDN_blocking,791,NA,NA,Calling_Party_Address_Blocking,30
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,Alphanumeric_A_MSISDN_blocking,2,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638,NA,NA

请注意,输出的顺序已更改,因为它取决于键在seen中的遍历顺序。强制使用固定的顺序需要额外的步骤,我不确定是否有此必要。

答案 1 :(得分:0)

#!/usr/bin/awk -f
# Merge several CSV files whose first two fields form the record key.
#
# Usage: awk -f script.awk file1 file2 file3
#
# One line is printed per key, in order of first appearance: the key followed
# by the remaining fields of that key's record from each input file, in
# command-line order.  A file with no record for the key contributes one
# "NA" per non-key field, the width being taken from that file's first record.
#
# Globals:
#   file_index     ordinal of the file being read
#   field_count[f] NF of the first record of file f
#   keys[1..k]     keys in order of first appearance
#   data[key]      merged ",field,field,..." text accumulated so far
#   validity[key]  number of files already accounted for in data[key]
BEGIN { FS = "," }

# Return ",NA" repeated once per non-key field of a record with nf fields.
function na_padding(nf,    pad, c) {
    pad = ""
    for (c = nf - 2; c > 0; --c) {
        pad = pad ",NA"
    }
    return pad
}

# Pad every known key that had no record in the file that just ended
# (file number file_index), so data[] stays column-aligned.
function include_missing(    key, pad, n) {
    pad = na_padding(field_count[file_index])
    for (n = 1; n <= k; ++n) {
        key = keys[n]
        if (validity[key] < file_index) {
            data[key] = data[key] pad
            ++validity[key]
        }
    }
}

{
    # Drop trailing blanks / carriage return before the fields are used.
    sub(/[ \t\r]*$/, "")
    key = $1 FS $2
    if (FNR == 1) {
        # A new file starts: close out the previous one first.
        if (file_index) {
            include_missing()
        }
        field_count[++file_index] = NF
    }
    if (!(key in validity)) {
        # First sighting of this key in any file: register it and back-fill
        # NA columns for every earlier file, so keys that are absent from
        # the first file are still emitted and stay aligned.
        keys[++k] = key
        for (f = 1; f < file_index; ++f) {
            data[key] = data[key] na_padding(field_count[f])
        }
        validity[key] = file_index - 1
    }
    ++validity[key]
    # Append everything after the key, including its leading comma.
    data[key] = data[key] substr($0, length(key) + 1)
}

END {
    # The last file has no following FNR == 1 to trigger the padding.
    include_missing()
    for (i = 1; i <= k; ++i) {
        key = keys[i]
        print key data[key]
    }
}

用法:

awk -f script.awk file1 file2 file3

输出(注意:其中的 1160 等数值并不在上文给出的示例数据中,此输出应是基于另一组输入文件得到的):

TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Alphanumeric_A_MSISDN_blocking,1,NA,NA,Calling_Party_Address_Blocking,79
CABLE&WIRELESS_BARBADOS,BARBADOS,Alphanumeric_A_MSISDN_blocking,791,NA,NA,Calling_Party_Address_Blocking,30
SIMINN_ICELAND_TELECOM,ICELAND,Alphanumeric_A_MSISDN_blocking,109373,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795,Calling_Party_Address_Blocking,1160
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,Alphanumeric_A_MSISDN_blocking,2,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638,NA,NA
CABLE&WIRELESS_JAMAICA,JAMAICA,Alphanumeric_A_MSISDN_blocking,85,NA,NA,Calling_Party_Address_Blocking,1

使用 GNU Awk 还可以写得更简单:

#!/usr/bin/awk -f
# Merge several CSV files whose first two fields form the record key.
# Requires GNU awk: it relies on the ARGIND variable and the ENDFILE pattern.
#
# Usage: gawk -f script.awk file1 file2 file3
#
# One line is printed per key, in order of first appearance: the key followed
# by the remaining fields of that key's record from each input file, in
# command-line order.  A file with no record for the key contributes one
# "NA" per non-key field, the width being taken from that file's last record.
# NOTE(review): ARGIND is used as the file ordinal, which assumes every
# command-line operand is an input file (no var=value operands) -- confirm.
#
# Globals:
#   keys[1..k]     keys in order of first appearance
#   data[key]      merged ",field,field,..." text accumulated so far
#   validity[key]  number of files already accounted for in data[key]
#   padding[f]     the ",NA..." filler computed when file f ended
BEGIN { FS = "," }
{
    # Drop trailing blanks / carriage return before the fields are used.
    sub(/[ \t\r]*$/, "")
    key = $1 FS $2
    if (!(key in validity)) {
        # First sighting of this key in any file: register it and back-fill
        # the filler of every earlier file, so keys that are absent from the
        # first file are still emitted and stay aligned.
        keys[++k] = key
        for (f = 1; f < ARGIND; ++f) {
            data[key] = data[key] padding[f]
        }
        validity[key] = ARGIND - 1
    }
    ++validity[key]
    # Append everything after the key, including its leading comma.
    data[key] = data[key] substr($0, length(key) + 1)
}
ENDFILE {
    # NF still holds the field count of the file's last record here.
    append = ""
    for (i = NF - 2; i > 0; --i) {
        append = append ",NA"
    }
    padding[ARGIND] = append
    # Pad every known key that had no record in the file that just ended.
    for (i = 1; i <= k; ++i) {
        key = keys[i]
        if (validity[key] < ARGIND) {
            data[key] = data[key] append
            ++validity[key]
        }
    }
}
END {
    for (i = 1; i <= k; ++i) {
        key = keys[i]
        print key data[key]
    }
}

用法:

gawk -f script.awk file1 file2 file3