使用 awk 按单列对两个文件做全连接(full join)

时间:2015-06-28 04:50:56

标签: linux bash shell unix awk

我有两个 CSV 文件,想用 awk 把它们连接(join)起来。

file1.csv:

A1,B1,C1
"apple",1,2
"orange",2,3
"pear",5,4

file2.csv:

A2,D2,E2,F2
"apple",1,3,4
"peach",2,3,3
"pear",5,4,2
"mango",6,5,1

这是我想要的输出:

{{1}}

我想对 file1 和 file2 做全连接,连接条件是 A1 = A2。file2 的行数多于 file1。对于在另一个文件中找不到匹配值的记录,缺失的列用 NULL 填充。

4 个答案:

答案 0 :(得分:3)

为简单起见,可以使用标准join实用程序。

注意:join需要排序输入,因此解决方案必须首先对输入进行排序

join 示例

# Full outer join of file1.csv and file2.csv on their first column using the
# standard join utility. Writes the result (with a header line) to output.txt;
# file3.csv and file4.csv hold the sorted, header-less intermediate data.

# join requires both inputs sorted on the join field. The sort key must be
# exactly that field with the same delimiter join uses (-t , / field 1);
# a bare "-k 1" would sort on the whole line and can disagree with join's
# ordering when one key is a prefix of another.
tail -n +2 file1.csv | sort -t , -k 1,1 >file3.csv
tail -n +2 file2.csv | sort -t , -k 1,1 >file4.csv

# Header: first line of each file, glued together with a comma.
paste -d, file1.csv file2.csv | head -n 1 >output.txt

# -a 1 -a 2 also emits unpairable rows from either side (full outer join);
# -e NULL fills the fields listed in -o that are missing for those rows.
join -a 1 -a 2 -t , -e NULL -1 1 -2 1 \
     -o 1.1,1.2,1.3,2.1,2.2,2.3,2.4 \
     file3.csv file4.csv >>output.txt

**输出**

A1,B1,C1,A2,D2,E2,F2
"apple",1,2,"apple",1,3,4
NULL,NULL,NULL,"mango",6,5,1
"orange",2,3,NULL,NULL,NULL,NULL
NULL,NULL,NULL,"peach",2,3,3
"pear",5,4,"pear",5,4,2

答案 1 :(得分:2)

您可以使用此awk

# Full outer join of file1.csv and file2.csv on their first column.
# file2.csv is read first and cached by key, then file1.csv is streamed.
# Both files must start with a header row; the output header and the width
# of the NULL padding are derived from those headers instead of hard-coded.
# Unmatched file2.csv rows are printed last, in unspecified (for-in) order.
awk -F, '
    # Return n comma-separated NULL placeholders.
    function nulls(n,    s, k) {
        for (k = 1; k <= n; k++) s = s (k > 1 ? FS : "") "NULL"
        return s
    }
    FNR == 1 {
        if (NR == 1) {                # header of file2.csv
            hdr2 = $0
            pad2 = nulls(NF)
        } else {                      # header of file1.csv
            print $0 FS hdr2
            pad1 = nulls(NF)
        }
        next
    }
    FNR == NR { right[$1] = $0; next }        # data rows of file2.csv
    {                                         # data rows of file1.csv
        print $0 FS (($1 in right) ? right[$1] : pad2)
        delete right[$1]                      # mark the key as consumed
    }
    END { for (key in right) print pad1 FS right[key] }
' file2.csv file1.csv
A1,B1,C1,A2,D2,E2,F2
"apple",1,2,"apple",1,3,4
"orange",2,3,NULL,NULL,NULL,NULL
"pear",5,4,"pear",5,4,2
NULL,NULL,NULL,"mango",6,5,1
NULL,NULL,NULL,"peach",2,3,3

答案 2 :(得分:1)

试试这个:

# Full outer join of file1.csv and file2.csv on their first column.
# file1.csv is cached by key, then file2.csv is streamed; file1.csv rows
# that never matched are printed at the end in unspecified (for-in) order.
# NULL padding is fixed at 3 columns for file1.csv and 4 for file2.csv.
awk -F',' '
    NR == FNR {                         # first operand: file1.csv
        if (FNR == 1) head = $0         # keep its header for the output header
        else a[$1] = $0
        next
    }
    FNR == 1 { print head "," $0; next }    # header of file2.csv
    ($1 in a) {                         # membership test: creates no entries
        print a[$1] "," $0              # explicit comma, not OFS (a space)
        delete a[$1]
        next
    }
    { print "NULL,NULL,NULL," $0 }
    END { for (i in a) print a[i] ",NULL,NULL,NULL,NULL" }
' file1.csv file2.csv

输出:

A1,B1,C1,A2,D2,E2,F2
"apple",1,2 "apple",1,3,4
NULL,NULL,NULL,"peach",2,3,3
"pear",5,4 "pear",5,4,2
NULL,NULL,NULL,"mango",6,5,1
"orange",2,3,NULL,NULL,NULL,NULL

答案 3 :(得分:0)

为了完整起见,纯 bash 脚本也能解决这个问题,但有一些限制。下面的脚本适用于本例,但它假设相互匹配的行在两个输入文件中位于相同的行号上。如果你的 CSV 文件不满足这一点,需要先对文件做一次简单的预排序。

与其说它是其他方案的竞争者,不如说是一个练习;不过它最终同样满足要求,输出顺序与其他一些解决方案相同甚至更好。如有任何疑问,请告诉我:

#!/bin/bash
#
# Positional full join of two CSV files on their first column.
#
# Usage: read2redir.sh FILE1 FILE2
#
# Row i of FILE1 is compared with row i of FILE2; no sorting or key lookup is
# performed, so matching rows must sit on the same row number in both files.
# The first row of each file is treated as a header and is always joined.
# When the first fields are equal the two rows are printed side by side;
# otherwise each row gets its own output line with the other side padded
# with NULL. Rows left over in the longer file are NULL-padded as well.
# Fields are split on every comma (quoted commas are not supported) and
# empty lines are ignored. The exit status is non-zero on bad usage,
# unreadable input, or an input without any rows.

# Print $1 comma-separated NULL placeholders (no trailing newline).
null_row() {
    local -i count=$1 k
    local out=""
    for ((k = 0; k < count; k++)); do
        out+="${out:+,}NULL"
    done
    printf '%s' "$out"
}

# Join FILE1 ($1) and FILE2 ($2) row by row and write the result to stdout.
# Returns 1 if an input cannot be read or contains no rows.
full_join() {
    local file1=$1 file2=$2
    local -a rows1=() rows2=()
    local line commas nulls1 nulls2 f
    local -i n1 n2 common i

    for f in "$file1" "$file2"; do
        if [[ ! -r $f ]]; then
            printf 'error: cannot read %s\n' "$f" >&2
            return 1
        fi
    done

    # Rows are stored whole, so fields keep spaces, glob characters and
    # empty values. "|| [[ -n $line ]]" keeps a final row that lacks a
    # trailing newline.
    while IFS= read -r line || [[ -n $line ]]; do
        [[ -n $line ]] || continue
        rows1+=("$line")
    done <"$file1"

    while IFS= read -r line || [[ -n $line ]]; do
        [[ -n $line ]] || continue
        rows2+=("$line")
    done <"$file2"

    n1=${#rows1[@]}
    n2=${#rows2[@]}
    if ((n1 == 0 || n2 == 0)); then
        printf 'error: each input file needs at least a header row\n' >&2
        return 1
    fi

    # Column count of each file = commas in its header row + 1.
    commas=${rows1[0]//[^,]/}
    nulls1=$(null_row $((${#commas} + 1)))
    commas=${rows2[0]//[^,]/}
    nulls2=$(null_row $((${#commas} + 1)))

    common=$((n1 < n2 ? n1 : n2))

    for ((i = 0; i < common; i++)); do      ## rows present in both files
        if ((i == 0)) || [[ ${rows1[i]%%,*} == "${rows2[i]%%,*}" ]]; then
            printf '%s,%s\n' "${rows1[i]}" "${rows2[i]}"
        else
            printf '%s,%s\n' "${rows1[i]}" "$nulls2"
            printf '%s,%s\n' "$nulls1" "${rows2[i]}"
        fi
    done

    for ((i = common; i < n1; i++)); do     ## excess rows of FILE1
        printf '%s,%s\n' "${rows1[i]}" "$nulls2"
    done

    for ((i = common; i < n2; i++)); do     ## excess rows of FILE2
        printf '%s,%s\n' "$nulls1" "${rows2[i]}"
    done
}

# Validate the command line and run the join.
main() {
    if (($# != 2)); then
        printf 'Usage: %s FILE1 FILE2\n' "${0##*/}" >&2
        return 2
    fi
    full_join "$1" "$2"
}

main "$@"

**输出**

$ bash ../read2redir.sh A1.txt A2.txt
A1,B1,C1,A2,D2,E2,F2
"apple",1,2,"apple",1,3,4
"orange",2,3,NULL,NULL,NULL,NULL
NULL,NULL,NULL,"peach",2,3,3
"pear",5,4,"pear",5,4,2
NULL,NULL,NULL,"mango",6,5,1