我有两个文本文件
cat A.txt
10,1,1,"ABC"
10,1,2,"S1"
10,1,2,"ABC"
10,1,3,"baba"
10,2,1,"S2"
10,2,1,"asd"
10,2,2,"S3"
10,2,2,"dkkd"
10,2,3,"ABC"
cat B.txt
10,1,1,"ABC1"
10,1,2,"S1"
10,1,2,"ABC"
10,1,3,"baba"
10,2,1,"asd"
10,2,2,"S3"
10,2,2,"dkkd"
10,2,4,"bokaj"
我想读取这两个文本文件,找出各自相对于对方缺失的字段,用 " " 把缺失的字段补到两个文件中,然后分别保存为两个新的修改后的文件
A1.txt是A.txt的修改版本
cat A1.txt
10,1,1,"ABC"
10,1,2,"S1"
10,1,2,"ABC"
10,1,3,"baba"
10,2,1,"S2"
10,2,1,"asd"
10,2,2,"S3"
10,2,2,"dkkd"
10,2,3,"ABC"
10,2,4," "
B1.txt是B.txt的修改版本
cat B1.txt
10,1,1,"ABC1"
10,1,2,"S1"
10,1,2,"ABC"
10,1,3,"baba"
10,2,1," "
10,2,1,"asd"
10,2,2,"S3"
10,2,2,"dkkd"
10,2,3," "
10,2,4,"bokaj"
请确保A1.txt的总行数与B1.txt的总行数相同。很抱歉,我是bash新手,掌握的命令有限,无法给出一个完整可用的MWE;如果您能附上解释,将有助于我学习。
这是我到目前为止尝试过的MWE
#!/bin/bash
#
# Pad A.txt and B.txt so that both list the same keys the same number of
# times. A key is the first three comma-separated fields of a line. For every
# occurrence of a key that one file has and the other lacks, a placeholder
# line `key," "` is added to the file that lacks it.
#
# Results go to A1.txt and B1.txt. A.txt and B.txt are only read, never
# modified, so the script can be re-run safely.
# The exit status is that of main (non-zero if an input is unreadable).

#######################################
# Write a padded copy of one file.
# Arguments:
#   $1 - file to pad
#   $2 - file to compare against
#   $3 - output file
# Outputs:
#   writes the padded file to $3; diagnostics go to stderr
# Returns:
#   0 on success, non-zero if an input is unreadable or a tool fails
#######################################
fill_missing() {
  local src=$1 other=$2 out=$3
  local padding

  if [[ ! -r "$src" || ! -r "$other" ]]; then
    printf 'fill_missing: cannot read %s or %s\n' "$src" "$other" >&2
    return 1
  fi

  # Count every key of $src, then let each line of $other consume one
  # occurrence. A line of $other with nothing left to consume marks an
  # occurrence that $src is missing. Counting (instead of a plain
  # "is the key present" test) keeps duplicated keys balanced as well.
  padding=$(
    awk -F, -v OFS=, '
      { key = $1 FS $2 FS $3 }
      FILENAME == ARGV[1] { have[key]++; next }
      have[key] > 0 { have[key]--; next }
      { print key, "\" \"" }
    ' "$src" "$other"
  ) || return

  # Placeholders are fed to sort ahead of the real lines and the sort is
  # stable (-s) on the three key fields only, so lines sharing a key keep
  # their input order and a placeholder precedes real lines with its key.
  # The key fields are compared numerically.
  {
    if [[ -n "$padding" ]]; then
      printf '%s\n' "$padding"
    fi
    cat -- "$src"
  } | LC_ALL=C sort -s -t, -k1,1n -k2,2n -k3,3n > "$out"
}

#######################################
# Pad A.txt against B.txt and B.txt against A.txt.
# Returns:
#   0 on success, non-zero as soon as one of the two steps fails
#######################################
main() {
  fill_missing A.txt B.txt A1.txt || return
  fill_missing B.txt A.txt B1.txt
}

main "$@"
答案 0 :(得分:2)
所以思路是:找出每个文件相对于另一个文件缺少的键(前三个字段),给这些键加上后缀
," "
,再把它们与原文件合并,并按前三个字段做稳定排序。
以下代码:
# Recreate the two sample inputs.
cat > A.txt <<'EOF'
10,1,1,"ABC"
10,1,2,"S1"
10,1,2,"ABC"
10,1,3,"baba"
10,2,1,"S2"
10,2,1,"asd"
10,2,2,"S3"
10,2,2,"dkkd"
10,2,3,"ABC"
EOF
cat > B.txt <<'EOF'
10,1,1,"ABC1"
10,1,2,"S1"
10,1,2,"ABC"
10,1,3,"baba"
10,2,1,"asd"
10,2,2,"S3"
10,2,2,"dkkd"
10,2,4,"bokaj"
EOF

# sorted_keys FILE
#   Print the first three comma-separated fields of every line of FILE,
#   sorted, which is the form comm needs for its inputs.
sorted_keys() {
  cut -d, -f1-3 < "$1" | sort
}

# comm -3 hides the keys common to both files: keys only A.txt has come out
# unprefixed, keys only B.txt has come out behind a single tab. Every line
# gets the `," "` placeholder appended, then tee feeds the same stream to
# two consumers:
#   * untabbed lines (A-only keys) are what B.txt lacks -> merged into B1.txt
#   * tabbed lines (B-only keys), tab removed, are what A.txt lacks -> A1.txt
# In both merges the placeholders (stdin) are listed ahead of the real file
# and the sort is stable on fields 1-3, so lines with equal keys keep their
# input order and a placeholder precedes real lines that share its key.
comm -3 <(sorted_keys A.txt) <(sorted_keys B.txt) \
  | sed 's/$/," "/' \
  | tee >(grep -v $'^\t' | sort -s -t, -k1,3 - B.txt > B1.txt) \
  | grep $'^\t' \
  | cut -f2 \
  | sort -s -t, -k1,3 - A.txt > A1.txt

# Show both results with command tracing switched on.
set -x
cat B1.txt
cat A1.txt
将输出:
++ cat B1.txt
10,1,1,"ABC1"
10,1,2,"S1"
10,1,2,"ABC"
10,1,3,"baba"
10,2,1," "
10,2,1,"asd"
10,2,2,"S3"
10,2,2,"dkkd"
10,2,3," "
10,2,4,"bokaj"
++ cat A1.txt
10,1,1,"ABC"
10,1,2,"S1"
10,1,2,"ABC"
10,1,3,"baba"
10,2,1,"S2"
10,2,1,"asd"
10,2,2,"S3"
10,2,2,"dkkd"
10,2,3,"ABC"
10,2,4," "
在repl.it上进行了测试。
comm
的输出有点特别:第二个文件独有的行前面带有一个制表符前缀,而第一个文件独有的行没有前缀。因此,我用grep
过滤这个列表,分别取出A.txt或B.txt独有的行,对应的命令分别是grep -v $'^\t'
和grep $'^\t' | cut -f2
。
如果运行两次comm
,脚本会更冗长一些,但也更“线性”(即不使用tee
的写法,我不知道该怎么称呼它):
# keys_of FILE
#   Print the first three comma-separated fields of every line of FILE,
#   sorted, ready to be compared with comm.
keys_of() {
  cut -d, -f1-3 < "$1" | sort
}

# pad_file SELECTOR TARGET RESULT
#   SELECTOR is the comm option that leaves only the keys TARGET lacks
#   (-13: keys found only in B.txt, -23: keys found only in A.txt).
#   Those keys get the `," "` placeholder appended and are merged with
#   TARGET by a stable sort on fields 1-3; the merged list is written to
#   RESULT.
pad_file() {
  local selector=$1 target=$2 result=$3

  comm "$selector" <(keys_of A.txt) <(keys_of B.txt) |
    sed 's/$/," "/' |
    sort -s -t, -k1,3 - "$target" > "$result"
}

pad_file -13 A.txt A1.txt
pad_file -23 B.txt B1.txt
还有一个简洁的4行版本:
# Condensed form: comm -3 lists the keys (fields 1-3) unique to either file,
# sed appends the `," "` placeholder, and tee splits the stream so that
# A-only keys (no leading tab) are merged into B1.txt while B-only keys
# (leading tab, removed by cut) are merged into A1.txt.
comm -3 <(cut -d ',' -f 1-3 A.txt | sort) <(cut -d ',' -f 1-3 B.txt | sort) \
  | sed -e 's/$/," "/' \
  | tee >(grep -v $'^\t' | sort -s -t ',' -k 1,3 - B.txt > B1.txt) \
  | grep $'^\t' | cut -f 2 | sort -s -t ',' -k 1,3 - A.txt > A1.txt
答案 1 :(得分:1)
这是一个awk
脚本,可以执行任务。
script.awk
# script.awk -- usage: awk -f script.awk FILE1 FILE2
#
# Writes FILE1.1 and FILE2.1. Both inputs are walked in parallel, line by
# line. Each line is copied to its own output file; when the exact same line
# does not occur anywhere in the other input, a copy whose first quoted value
# is replaced by " " is also written to the other output file.
#
# NOTE(review): the FNR == NR test identifies the first file only when that
# file is not empty -- confirm inputs are never empty.

FNR == NR {                         # lines of the first input file
    fileNames[1] = FILENAME ".1"
    lines[FNR ",1"] = $0            # line text by position
    file1[$0]                       # set of whole lines seen in file 1
    file1count = FNR
    next
}

{                                   # lines of the second input file
    fileNames[2] = FILENAME ".1"
    lines[FNR ",2"] = $0
    file2[$0]                       # set of whole lines seen in file 2
    file2count = FNR
}

END {
    # Create/truncate both outputs even if nothing else gets written.
    printf "" > fileNames[1]
    printf "" > fileNames[2]

    maxFileLen = file1count > file2count ? file1count : file2count
    for (i = 1; i <= maxFileLen; i++) {
        # Past the end of the shorter file there is no line to copy or to
        # compare; skipping that side keeps blank lines out of the outputs.
        if (i <= file1count) {
            print lines[i ",1"] > fileNames[1]
            if (!(lines[i ",1"] in file2))
                print replaceField(lines[i ",1"]) > fileNames[2]
        }
        if (i <= file2count) {
            if (!(lines[i ",2"] in file1))
                print replaceField(lines[i ",2"]) > fileNames[1]
            print lines[i ",2"] > fileNames[2]
        }
    }
}

# Return inpStr with its first non-empty double-quoted value replaced by " ".
function replaceField(inpStr) {
    sub(/"[^"]+"/, "\" \"", inpStr)
    return inpStr
}
运行脚本
awk -f script.awk A.txt B.txt
输出文件名是在输入文件名后附加.1
得到的
输出A.txt.1
10,1,1,"ABC"
10,1,1," "
10,1,2,"S1"
10,1,2,"ABC"
10,1,3,"baba"
10,2,1,"S2"
10,2,1,"asd"
10,2,2,"S3"
10,2,2,"dkkd"
10,2,4," "
10,2,3,"ABC"
输出B.txt.1
10,1,1," "
10,1,1,"ABC1"
10,1,2,"S1"
10,1,2,"ABC"
10,1,3,"baba"
10,2,1," "
10,2,1,"asd"
10,2,2,"S3"
10,2,2,"dkkd"
10,2,4,"bokaj"
10,2,3," "
输出与问题的文字描述相符,但与问题中给出的示例输出不一致。
答案 2 :(得分:1)
使用GNU awk(需要它提供的数组的数组、sorted_in、gensub()和ARGIND):
$ cat tst.awk
# tst.awk (GNU awk) -- usage: awk -f tst.awk FILE1 FILE2
#
# A key is the first three comma-separated fields. For every key the script
# tracks how many times it occurs in each file; tots[key] is the larger of
# the two counts. For each file it then prints, key by key, one `" "`
# placeholder line per missing occurrence followed by the values the file
# really has. Every printed line ends in "\t> <name>" naming the file it is
# meant for; nothing is written to those files by this version.

BEGIN {
    FS = ","
    OFS = ","
}

{
    key = $1 FS $2 FS $3
    keySet[$1][$2][$3] = key        # nested so the keys can be walked in order

    if (ARGIND == 1) {
        instNr = ++tots[key]        # occurrence number within file 1
    }
    else if (ARGIND == 2) {
        instNr = ++cnt2[key]        # occurrence number within file 2
        if (tots[key] < instNr) {
            tots[key] = instNr      # file 2 has more occurrences than file 1
        }
    }

    # Value of this occurrence, addressed by file, key and occurrence number.
    vals[ARGIND, key, instNr] = $NF
}

END {
    # Visit each nesting level in ascending numeric index order and flatten
    # the keys into keys[1..numKeys].
    PROCINFO["sorted_in"] = "@ind_num_asc"
    for (f1 in keySet)
        for (f2 in keySet[f1])
            for (f3 in keySet[f1][f2])
                keys[++numKeys] = keySet[f1][f2][f3]

    prt(1)
    prt(2)
}

# Print the padded listing for input file number fileNr (1 or 2).
# The target name is that argument with its last ".ext" removed and "1.txt"
# appended. The names after fileNr are locals.
function prt(fileNr,    outName, k, curKey, n, slot) {
    outName = gensub(/\.[^.]+$/, "", 1, ARGV[fileNr]) "1.txt"

    for (k = 1; k <= numKeys; k++) {
        curKey = keys[k]

        # Placeholders first: one per occurrence this file does not have.
        for (n = 1; n <= tots[curKey]; n++) {
            slot = fileNr SUBSEP curKey SUBSEP n
            if (!(slot in vals))
                print curKey, "\" \"" "\t> " outName
        }

        # Then the values this file does have, in their original order.
        for (n = 1; n <= tots[curKey]; n++) {
            slot = fileNr SUBSEP curKey SUBSEP n
            if (slot in vals)
                print curKey, vals[slot] "\t> " outName
        }
    }
}
。
$ awk -f tst.awk A.txt B.txt
10,1,1,"ABC" > A1.txt
10,1,2,"S1" > A1.txt
10,1,2,"ABC" > A1.txt
10,1,3,"baba" > A1.txt
10,2,1,"S2" > A1.txt
10,2,1,"asd" > A1.txt
10,2,2,"S3" > A1.txt
10,2,2,"dkkd" > A1.txt
10,2,3,"ABC" > A1.txt
10,2,4," " > A1.txt
10,1,1,"ABC1" > B1.txt
10,1,2,"S1" > B1.txt
10,1,2,"ABC" > B1.txt
10,1,3,"baba" > B1.txt
10,2,1," " > B1.txt
10,2,1,"asd" > B1.txt
10,2,2,"S3" > B1.txt
10,2,2,"dkkd" > B1.txt
10,2,3," " > B1.txt
10,2,4,"bokaj" > B1.txt
当您对结果满意、想要真正写入输出文件时,把两条print
语句中的"\t> "
改成>
即可。