我想过滤一个CSV文件:删除那些仅第一列不同的行,但保留每个由这类"重复"行组成的块的第一行和最后一行。
我已经编写了一个能实现这一功能的小型bash脚本:
# Sentinel meaning "no line stored in this slot"; printgo skips any value
# equal to it.
nogo='this line is a nogo, replace it!'

# Two-slot state used by the read loop below:
#   prev1 - the most recently read line
#   prev2 - the first line of the current run of lines sharing a key
# Both start out empty, i.e. set to the sentinel.
prev1="${nogo}"
prev2="${nogo}"
#######################################
# Print a stored line unless the slot is empty.
# Globals:   nogo (read) - sentinel value marking an empty slot
# Arguments: $1 - the line to print, or the sentinel
# Outputs:   writes $1 followed by a newline to stdout when $1 is not
#            the sentinel; writes nothing otherwise
#######################################
printgo() {
  if [[ "$1" != "${nogo}" ]]; then
    # printf instead of echo: echo would treat a line such as "-n" or "-e"
    # as an option and print nothing (or alter the output).
    printf '%s\n' "$1"
  fi
}
# Stream the input and, for every run of consecutive lines that share the
# same key, print only the first and the last line of that run.
#
# The key of a line is the line with the first match of "[0-9 :-]*;"
# removed (this strips a leading numeric/timestamp-like column).
#
# Input is the file named by $1, or stdin when no argument is given.
# Relies on nogo, prev1, prev2 and printgo defined earlier in this script.
# Limitation: an input line identical to the sentinel in $nogo is treated
# as an empty slot.

# Regex kept in a variable so [[ =~ ]] treats it as a pattern.
key_re='[0-9 :-]*;'
prev_key=

# IFS= and -r keep each line byte-for-byte (no whitespace trimming, no
# backslash processing); the "|| [[ -n ... ]]" part also processes a final
# line that lacks a trailing newline.
while IFS= read -r line || [[ -n "$line" ]]; do
  # Compute the key in-shell instead of forking sed for every line.
  # Removing the first literal occurrence of the matched text is the same
  # as removing the leftmost regex match.
  if [[ "$line" =~ $key_re ]]; then
    key=${line/"${BASH_REMATCH[0]}"/}
  else
    key=$line
  fi

  if [[ "$prev1" == "$nogo" ]]; then
    # No previous line yet: nothing to compare against.
    :
  elif [[ "$prev_key" != "$key" ]]; then
    # The key changed: flush the start (if any) and end of the finished run.
    printgo "$prev2"
    printgo "$prev1"
    prev2=$nogo
  elif [[ "$prev2" == "$nogo" ]]; then
    # Second line of a run: remember the previous line as the run start.
    prev2=$prev1
  fi

  prev1=$line
  prev_key=$key
done < "${1:-/dev/stdin}"

# Flush the last run.
printgo "$prev2"
printgo "$prev1"
现在我的问题是:如何在不花太多功夫的情况下提升这个脚本的性能? ;)
答案 0 :(得分:0)
我怀疑awk会更快:
#######################################
# For every run of consecutive lines that are equal once the first
# ';'-separated column is ignored, print only the first and last line.
# Arguments: optional input files; reads stdin when none are given
# Outputs:   the filtered lines on stdout
#######################################
keep_block_edges() {
  awk -F ';' '
    # key = the record without its first column; "*" also strips an empty
    # first column.
    { key = $0; sub("^[^" FS "]*" FS, "", key) }

    # A new run starts on the first record or when the key changes.
    # NR == 1 is needed because the key of the first record may be empty,
    # which would equal the uninitialised prev_key.
    NR == 1 || key != prev_key {
      # Close the previous run. Record numbers are compared rather than
      # line text so that a run whose first and last lines are identical
      # still gets both printed.
      if (prev_nr != start_nr) print prev_line
      start_nr = NR
      print
    }

    { prev_key = key; prev_line = $0; prev_nr = NR }

    # Close the final run (nothing happens on empty input).
    END { if (prev_nr != start_nr) print prev_line }
  ' "$@"
}

# Demo run; the delimiter is quoted so the data is passed literally.
keep_block_edges <<'END'
1;2;3;4
2;2;3;4
3;2;3;4
4;2;3;4
9;8;7;6
8;8;7;6
1;2;3;4
1;8;7;6
2;8;7;6
3;8;7;6
END
1;2;3;4
4;2;3;4
9;8;7;6
8;8;7;6
1;2;3;4
1;8;7;6
3;8;7;6