我在文本文件中有以下定界内容:
col1|col2|col3|col4|col5|col6
id1|2314|jack|nov-12|water|3294
id2|8322|john|dec-01|sand|2334
id1|2314|jill|nov-12|oil|3294
id1|2314|jim|nov-12|ether|3294
id3|6775|mike|jan-13|dust|9348
我想合并第一列匹配的第三列和第五列的内容。输出应如下所示:
col1|col2|col3-1|col3-2|col3-3|col4|col5-1|col5-2|col5-3|col6
id1|2314|jack|jill|jim|nov-12|water|oil|ether|3294
id2|8322|john|||dec-01|sand|||2334
id3|6775|mike|||jan-13|dust|||9348
输出中的行和列顺序与输入的顺序无关紧要。
编辑1:每个被合并的列最多展开为5列,超出的值应以逗号分隔追加到第5个合并列中,例如:
col1|col2|col3-1|col3-2|col3-3|col3-4|col3-5|col4|col5-1|col5-2|col5-3|col5-4|col5-5|col6
id1|2314|jack|jill|jim|val3-4|val3-5,val3-6|nov12|water|oil|ether|val5-4|val5-5,val5-6|3294
编辑2:作为一个旁注,实际文件包含14列,第9列和第13列需要合并。我能够根据下面的@Allan答案进行调整。另外,正如我在@ RavinderSingh13的评论中提到的那样,输出由cron作业自动处理,因此合并后的列数必须固定为每列5个。
答案 0 :(得分:2)
请尝试以下方案:它根据实际数据动态确定合并后的列数(不把列数硬编码为3),并按各组中出现的最大次数为col3和col5生成相应数量的列标题。
awk -F'|' '
# Reads Input_file twice. OFS is set between the two file arguments, so it
# only applies to the second pass, which is the one that prints.
#
# Pass 1: per key in column 1, collect the column-3 and column-5 values in
# input order and remember the size of the largest group in num.
FNR==NR{
  cnt[$1]++
  b[$1]=(cnt[$1]>1 ? b[$1] FS : "") $3
  c[$1]=(cnt[$1]>1 ? c[$1] FS : "") $5
  if(cnt[$1]>num){
    num=cnt[$1]
  }
  next
}
# Pass 2, header line: replace col3 and col5 by num numbered headings each.
# The separator goes between headings only, so no empty heading is produced.
FNR==1{
  h3=h5=""
  for(i=1;i<=num;i++){
    sep=(i>1 ? OFS : "")
    h3=h3 sep "col3-" i
    h5=h5 sep "col5-" i
  }
  $3=h3
  $5=h5
  print
  next
}
# Pass 2, data lines: print each key once, on its first occurrence, with the
# merged values padded to exactly num columns (one OFS per missing value).
!d[$1]++{
  pad=""
  for(i=cnt[$1];i<num;i++){
    pad=pad OFS
  }
  $3=b[$1] pad
  $5=c[$1] pad
  print
}' Input_file OFS="|" Input_file
答案 1 :(得分:1)
答案的第一部分(列数固定为3的简陋解决方案):
# Fixed-width variant: the header is hard-coded to three col3/col5 columns.
# Needs gawk, because asorti() is a gawk extension.
# Padding ("||") is added only when the merged col3 value contains no "|",
# i.e. for keys that occur once; a key that occurs exactly twice is printed
# with one value column missing in each merged group.
awk 'BEGIN{FS=OFS="|"; print "col1|col2|col3-1|col3-2|col3-3|col4|col5-1|col5-2|col5-3|col6"}NR>1{col2[$1]=$2;col4[$1]=$4;col6[$1]=$6;if(length(col3[$1])==0){col3[$1]=$3}else{col3[$1]=col3[$1]"|"$3}if(length(col5[$1])==0){col5[$1]=$5}else{col5[$1]=col5[$1]"|"$5}}END{n=asorti(col3,oArray);for(i=1; i<=n;i++){if(index(col3[oArray[i]],"|")==0){col3[oArray[i]]=col3[oArray[i]]"||";col5[oArray[i]]=col5[oArray[i]]"||";};print oArray[i],col2[oArray[i]],col3[oArray[i]],col4[oArray[i]],col5[oArray[i]],col6[oArray[i]]}}' csvToMerge.in
col1|col2|col3-1|col3-2|col3-3|col4|col5-1|col5-2|col5-3|col6
id1|2314|jack|jill|jim|nov-12|water|oil|ether|3294
id2|8322|john|||dec-01|sand|||2334
id3|6775|mike|||jan-13|dust|||9348
更具可读性:
$ cat awkprof.out
# gawk profile, created Fri Dec 14 13:12:34 2018
# BEGIN rule(s)
BEGIN {
1 FS = OFS = "|"
1 print "col1|col2|col3-1|col3-2|col3-3|col4|col5-1|col5-2|col5-3|col6"
}
# Rule(s)
6 NR > 1 { # 5
5 col2[$1] = $2
5 col4[$1] = $4
5 col6[$1] = $6
5 if (length(col3[$1]) == 0) { # 3
3 col3[$1] = $3
2 } else {
2 col3[$1] = col3[$1] "|" $3
}
5 if (length(col5[$1]) == 0) { # 3
3 col5[$1] = $5
2 } else {
2 col5[$1] = col5[$1] "|" $5
}
}
# END rule(s)
END {
1 n = asorti(col3, oArray)
3 for (i = 1; i <= n; i++) {
3 if (index(col3[oArray[i]], "|") == 0) { # 2
2 col3[oArray[i]] = col3[oArray[i]] "||"
2 col5[oArray[i]] = col5[oArray[i]] "||"
}
3 print oArray[i], col2[oArray[i]], col3[oArray[i]], col4[oArray[i]], col5[oArray[i]], col6[oArray[i]]
}
}
美丽的解决方案
通过计算col1中同一取值出现的最大次数,动态构造col3和col5的合并列。
脚本csvmerge.awk
#function definitions
# Pad a merged cell so that it always spans MAX columns.
#   MAX   - number of columns the merged cell has to occupy
#   input - values already joined with "|"
# Returns input followed by one OFS per missing column; input comes back
# unchanged when it already holds MAX or more values.
# The names after the extra spaces in the parameter list are locals (the usual
# awk idiom), so calling this function does not overwrite globals such as tmp.
function paddingfunction(MAX,input,    seps,have,pad,u){
    seps=input;
    gsub(/[^|]/,"",seps);   # keep only the separators
    have=length(seps);      # separators present = number of values - 1
    pad=""
    for(u=have; u<MAX-1;u++)
    {
        pad=pad OFS;
    }
    return input pad;
}
# Print MAX numbered headings built from a prefix, each followed by OFS.
#   inputString - heading prefix, e.g. "col3-" gives col3-1|col3-2|...
# Reads the global MAX. Nothing is returned; the text goes to standard output
# without a newline so the caller can keep appending to the header line.
# k is a local (extra parameter), so a loop counter i used by the caller is
# left untouched.
function headerAppender(inputString,    k){
    for(k=1;k<=MAX;k++){
        # Fixed format string: a "%" inside the prefix must not be
        # interpreted as a conversion.
        printf "%s%s%s", inputString, k, OFS
    }
}
BEGIN{
    # Input and output share the pipe as field delimiter.
    OFS="|";
    FS=OFS;
    # Header line, emitted piece by piece: the fixed columns surround the two
    # groups of numbered headings written by headerAppender.
    printf "col1" OFS;
    printf "col2" OFS;
    headerAppender("col3-");
    printf "col4" OFS;
    headerAppender("col5-");
    print "col6"
}
# Data rows (row 1 is the header and is skipped): remember the plain columns
# and append the column-3 and column-5 values to the lists kept per key.
NR>1{
    # Decide by membership whether this key was seen before. Testing the
    # length of the stored text instead would mistake an empty first value for
    # "nothing stored yet" and lose its position, so the col3 values would no
    # longer line up with the col5 values of the same input rows.
    if($1 in col3){
        col3[$1]=col3[$1] OFS $3
        col5[$1]=col5[$1] OFS $5
    }
    else{
        col3[$1]=$3
        col5[$1]=$5
    }
    # For the columns that are not merged the last row of a key wins.
    col2[$1]=$2;
    col4[$1]=$4;
    col6[$1]=$6;
}
END{
    # asorti (gawk) stores the sorted indices of col3 in oArray and returns
    # how many there are.
    n=asorti(col3,oArray);
    # One output row per key, in sorted key order, with both merged cells
    # padded to MAX columns.
    i=1;
    while(i<=n){
        key=oArray[i];
        merged3=paddingfunction(MAX,col3[key]);
        merged5=paddingfunction(MAX,col5[key]);
        print key,col2[key],merged3,col4[key],merged5,col6[key];
        i++;
    }
}
输入1:(要分组的6个元素)
$ cat csvToMerge.in
col1|col2|col3|col4|col5|col6
id1|2314|jack|nov-12|water|3294
id2|8322|john|dec-01|sand|2334
id1|2314|jill|nov-12|oil|3294
id1|2314|jim|nov-12|ether|3294
id3|6775|mike|jan-13|dust|9348
id4|6776|mik1|jan-14|dast|9344
id4|6776|mik2|jan-14|dest|9344
id4|6776|mik3|jan-14|dist|9344
id4|6776|mik4|jan-14|dost|9344
id4|6776|mik5|jan-14|dst|9344
id4|6776|mik6|jan-14|dut|9344
输入2:(要分组的5个元素)
$ cat csvToMerge2.in
col1|col2|col3|col4|col5|col6
id1|2314|jack|nov-12|water|3294
id2|8322|john|dec-01|sand|2334
id1|2314|jill|nov-12|oil|3294
id1|2314|jim|nov-12|ether|3294
id3|6775|mike|jan-13|dust|9348
id4|6776|mik1|jan-14|dast|9344
id4|6776|mik2|jan-14|dest|9344
id4|6776|mik3|jan-14|dist|9344
id4|6776|mik4|jan-14|dost|9344
id4|6776|mik5|jan-14|dst|9344
输出1:
$ awk -f csvmerge.awk -v MAX=`awk -F'|' ' {tot[$1]++}END{tmp=""; for (i in tot){if(tot[i]>tmp){tmp=tot[i]}}; print tmp; } ' csvToMerge.in` csvToMerge.in
col1|col2|col3-1|col3-2|col3-3|col3-4|col3-5|col3-6|col4|col5-1|col5-2|col5-3|col5-4|col5-5|col5-6|col6
id1|2314|jack|jill|jim||||nov-12|water|oil|ether||||3294
id2|8322|john||||||dec-01|sand||||||2334
id3|6775|mike||||||jan-13|dust||||||9348
id4|6776|mik1|mik2|mik3|mik4|mik5|mik6|jan-14|dast|dest|dist|dost|dst|dut|9344
输出2:
$ awk -f csvmerge.awk -v MAX=`awk -F'|' ' {tot[$1]++}END{tmp=""; for (i in tot){if(tot[i]>tmp){tmp=tot[i]}}; print tmp; } ' csvToMerge2.in` csvToMerge2.in
col1|col2|col3-1|col3-2|col3-3|col3-4|col3-5|col4|col5-1|col5-2|col5-3|col5-4|col5-5|col6
id1|2314|jack|jill|jim|||nov-12|water|oil|ether|||3294
id2|8322|john|||||dec-01|sand|||||2334
id3|6775|mike|||||jan-13|dust|||||9348
id4|6776|mik1|mik2|mik3|mik4|mik5|jan-14|dast|dest|dist|dost|dst|9344
注意:
-v MAX=`awk -F'|' ' {tot[$1]++}END{tmp=""; for (i in tot){if(tot[i]>tmp){tmp=tot[i]}}; print tmp; } ' csvToMerge.in`
这会把各组中出现次数的最大值保存到变量MAX中。在您的情况下最大值为5,但可以想象在其他情况下需要对更多元素进行分组。
答案 2 :(得分:0)
$ awk -f merge_fields.awk <(perl join.pl <(sort -t'|' -k1 data.txt))
id1|2314|jack|jill|jim|nov-12|water|oil|ether|3294
id2|8322|john|dec-01|sand|2334
id3|6775|mike|jan-13|dust|9348
join.pl
use v5.14;
use warnings;

# Reads pipe-delimited rows that are already sorted by their first field and
# prints one line per key: the fields of all rows sharing that key, joined
# with "|" in input order.

# Skip the header line with a plain read; readline(<>) would read a line and
# then try to use its text as a filehandle name.
scalar <>;

my @queue;
while (my $line = <>) {
    # Split every line, including a final line that has no newline.
    chomp $line;
    # Limit -1 keeps trailing empty fields so every row contributes the same
    # number of fields to the joined line.
    my @fields = split /\|/, $line, -1;
    next unless @fields;    # blank line: nothing to add

    # A new key starts: emit the rows collected for the previous one.
    if (@queue and $queue[0] ne $fields[0]) {
        say join('|', @queue);
        @queue = ();
    }
    push @queue, @fields;
}
say join('|', @queue) if @queue;
merge_fields.awk
# Input: one line per key in which the fields of all rows sharing that key
# are laid end to end (six fields per row in the sample data).
BEGIN { OFS=FS="|" }
# Lines holding more than one row: fold the col3/col5 values of the later
# rows into $3 and $5 of the first row.
NF > 6 {
    for (i = 6 + 1; i < NF; i++) {
        # A later row starts at a field equal to the key. Compare for string
        # equality: a regex match ($i ~ $1) would also fire on any field that
        # merely contains the key (key "id1" inside "id10") and would treat
        # the key as a pattern.
        if (($i "") == ($1 "")) {
            $3 = $3 OFS $(i+2)
            $5 = $5 OFS $(i+4)
        }
    }
}
# Only the first six fields are printed; $3 and $5 now carry the merged values.
{ print $1,$2,$3,$4,$5,$6 }
答案 3 :(得分:0)
这里是使用Perl的另一种解决方案。它最多为第3列和第5列各打印5个元素,并把其余元素打印在col6之后。空缺的位置用“xx”和“yy”作为默认值填充,以便在输出中看出占位。
脚本:
/tmp> cat csv_35col.ksh
perl -F"/\|/" -ane '
    # Runs once per input line (-n); -a with -F splits the line on "|" into @F.
    # %kv holds the four plain columns per id, %kv3 and %kv5 the col3 and col5
    # values in input order, @names the ids in order of first appearance.
    chomp($F[5]);
    $id=$F[0];
    if($.>1) {                    # line 1 is the header
        if( exists $kv{$id} )     # id seen before: append to its lists
        {
            push(@{ $kv3{$id} },$F[2]); push(@{ $kv5{$id} },$F[4]);
        }
        else
        {
            push(@names,$id); $kv{$id}=[ @F[0,1,3,5] ];
            $kv3{$id}=[ $F[2] ]; $kv5{$id}=[ $F[4] ];
        }
    }
    END {
        $d="|";
        # Header: col3 and col5 are expanded into five numbered headings each.
        # NOTE(review): every heading is followed by "|", so the header line
        # ends with a trailing "|" that the data lines do not have.
        for (1..6) { if($_==5 || $_==3) { $x=$_; for (1..5) { printf("%s|","col$x-$_")}} else { printf("%s|","col$_")} }
        for my $x (@names) {
            @n=@{$kv{$x}}; @n3=@{$kv3{$x}}; @n5=@{$kv5{$x}};
            # Fill missing or empty slots up to five with the placeholders;
            # a literal "0" is a real value and is kept.
            for (0..4) {
                $n3[$_]="xx" unless defined $n3[$_] && length $n3[$_];
                $n5[$_]="yy" unless defined $n5[$_] && length $n5[$_];
            }
            # Values beyond the fifth are appended after col6.
            print "\n".join($d,@n[0,1],@n3[0..4],$n[2],@n5[0..4],$n[3],@n3[5..$#n3],@n5[5..$#n5]);
        }
        print "\n";
    }
' $1
/tmp>
输入:
/tmp> cat jimw.csv
col1|col2|col3|col4|col5|col6
id1|2314|jack|nov-12|water|3294
id2|8322|john|dec-01|sand|2334
id1|2314|jill|nov-12|oil|3294
id1|2314|jim|nov-12|ether|3294
id3|6775|mike|jan-13|dust|9348
/tmp> cat jimw2.csv
col1|col2|col3|col4|col5|col6
id1|2314|jack|nov-12|water|3294
id2|8322|john|dec-01|sand|2334
id1|2314|jill|nov-12|oil|3294
id1|2314|jim|nov-12|ether|3294
id3|6775|mike|jan-13|dust|9348
id4|6776|mik1|jan-14|dast|9344
id4|6776|mik2|jan-14|dest|9344
id4|6776|mik3|jan-14|dist|9344
id4|6776|mik4|jan-14|dost|9344
id4|6776|mik5|jan-14|dst|9344
id4|6776|mik6|jan-14|dut|9344
/tmp>
结果:
/tmp> csv_35col.ksh jimw.csv
col1|col2|col3-1|col3-2|col3-3|col3-4|col3-5|col4|col5-1|col5-2|col5-3|col5-4|col5-5|col6|
id1|2314|jack|jill|jim|xx|xx|nov-12|water|oil|ether|yy|yy|3294
id2|8322|john|xx|xx|xx|xx|dec-01|sand|yy|yy|yy|yy|2334
id3|6775|mike|xx|xx|xx|xx|jan-13|dust|yy|yy|yy|yy|9348
/tmp>
/tmp> csv_35col.ksh jimw2.csv
col1|col2|col3-1|col3-2|col3-3|col3-4|col3-5|col4|col5-1|col5-2|col5-3|col5-4|col5-5|col6|
id1|2314|jack|jill|jim|xx|xx|nov-12|water|oil|ether|yy|yy|3294
id2|8322|john|xx|xx|xx|xx|dec-01|sand|yy|yy|yy|yy|2334
id3|6775|mike|xx|xx|xx|xx|jan-13|dust|yy|yy|yy|yy|9348
id4|6776|mik1|mik2|mik3|mik4|mik5|jan-14|dast|dest|dist|dost|dst|9344|mik6|dut
/tmp>