忽略引号中的分隔符并在csv文件中动态排除列

时间:2017-07-24 03:53:05

标签: bash shell csv awk

我有一条awk命令,用 | 作为分隔符读取csv文件。我在shell脚本中使用该命令,需要从输出中删除指定的列。要排除的列以列表形式给出,例如 1 2 3

命令参考:http://wiki.bash-hackers.org/snipplets/awkcsv

 awk -v FS='"| "|^"|"$' '{for i in $test; do $(echo $i=""); done  print }' test.csv

其中 $test 的值为 1 2 3

我想在print之前对所有要排除的列执行 $1="" $2="" $3=""。但我收到以下错误:

awk: {for i in $test; do $(echo $i=""); done  {print }
awk:      ^ syntax error

此命令正常工作,可打印所有列

awk -v FS='"| "|^"|"$' '{print }' test.csv

文件1

"first"| "second"| "last"
"fir|st"| "second"| "last"
"firtst one"| "sec|ond field"| "final|ly"

如果我想动态排除第2列和第3列,则预期输出为:
first
fir|st
firtst one

我需要帮助,以便正确地写出这个for循环。

3 个答案:

答案 0 :(得分:3)

使用GNU awk(需要其FPAT功能):

# FPAT (GNU awk) describes the fields themselves: each field is one
# double-quoted string, so a | inside the quotes does not split it.
# [^"]* rather than [^"]+ lets an empty field "" count as a field too.
$ awk -v FPAT='"[^"]*"' '{print $1}' file
"first"
"fir|st"
"firtst one"

# flds lists the field numbers to print, in output order. The selected
# fields are joined with OFS and each line is terminated with ORS.
$ awk -v flds='1' -v FPAT='"[^"]*"' 'BEGIN{n=split(flds,f,/ /)} {for (i=1;i<=n;i++) printf "%s%s", $(f[i]), (i<n?OFS:ORS)}' file
"first"
"fir|st"
"firtst one"

$ awk -v flds='2 3' -v FPAT='"[^"]*"' 'BEGIN{n=split(flds,f,/ /)} {for (i=1;i<=n;i++) printf "%s%s", $(f[i]), (i<n?OFS:ORS)}' file
"second" "last"
"second" "last"
"sec|ond field" "final|ly"

$ awk -v flds='3 1' -v FPAT='"[^"]*"' 'BEGIN{n=split(flds,f,/ /)} {for (i=1;i<=n;i++) printf "%s%s", $(f[i]), (i<n?OFS:ORS)}' file
"last" "first"
"last" "fir|st"
"final|ly" "firtst one"

如果您不希望输出字段以空格分隔,可以用 -v OFS='whatever' 将OFS设置为任何需要的分隔符。如果想去掉字段两端的引号,可以使用gensub()(反正我们已经在用gawk了),或者对每个字段使用substr(),例如:

# Print fields 1 and 3 joined by ; with the surrounding quotes removed.
# substr() skips the first character and keeps length-2 characters.
# [^"]* rather than [^"]+ in FPAT lets an empty field "" count as a field.
$ awk -v OFS=';' -v flds='1 3' -v FPAT='"[^"]*"' 'BEGIN{n=split(flds,f,/ /)} {for (i=1;i<=n;i++) printf "%s%s", substr($(f[i]),2,length($(f[i]))-2), (i<n?OFS:ORS)}' file
first;last
fir|st;last
firtst one;final|ly

# Same selection using gensub() (GNU awk only) to delete every " in the field.
$ awk -v OFS=';' -v flds='1 3' -v FPAT='"[^"]*"' 'BEGIN{n=split(flds,f,/ /)} {for (i=1;i<=n;i++) printf "%s%s", gensub(/"/,"","g",$(f[i])), (i<n?OFS:ORS)}' file
first;last
fir|st;last
firtst one;final|ly

答案 1 :(得分:2)

使用GNU awk(需要FPAT):

$ test="2 3"                       # field numbers to exclude, space separated
$ awk -v t="$test" '               # hand the list to awk as variable t
BEGIN {
    # FPAT (GNU awk) describes what a field looks like instead of what
    # separates fields, so a | inside a quoted field stays in that field.
    FPAT="([^|]+)|( *\"[^\"]+\")"
    n=split(t,a," ")               # a[1]=2, a[2]=3, ...
    for(k=1;k<=n;k++)
        e[a[k]]                    # referencing e[x] creates it: e is the exclusion set
}
{
    b=""
    for(i=1;i<=NF;i++)             # for each field
        if(!(i in e)) {            # keep only fields not listed in e
            f=$i                   # work on a copy so $0 is not rebuilt
            # Fields after the first start with the blank that follows |,
            # so strip optional leading blanks together with the opening quote.
            gsub(/^ *"|"$/,"",f)
            b=b (b==""?"":OFS) f   # append to output buffer b
        }
    print b
}' file
first
fir|st
firtst one

这里使用FPAT,是因为FS无法处理引号内的分隔符。

答案 2 :(得分:1)

Vikram,如果您实际的Input_file与所示的示例Input_file格式完全相同,那么以下代码可能对您有所帮助。我稍后也会在这里补充解释(已在较旧的GNU awk 3.1.7上测试)。

# Print each line of Input_file with the fields listed in num removed.
# num is a comma-separated list of field numbers counted from 1 (here 2 and 3).
# The main rule splits the record by hand instead of relying on FS: it
# repeatedly takes the leading chunk of $0 up to and including the next
# double quote and stores it in array[] unless it is only separator text
# (a lone quote, exactly quote-space-quote, or anything containing
# quote-pipe-space-quote). It then drops the first RLENGTH characters of $0
# and repeats until $0 is empty.
# Afterwards the entries named in num are deleted and the remaining entries
# are printed with their surrounding quotes removed, each followed by a
# space unless j equals length(array).
# NOTE(review): length(array) with an array argument is not accepted by
# every awk; the author reports testing with GNU awk 3.1.7.
awk -v num="2,3" 'BEGIN{
    len=split(num, val,",")
                  }
   {while($0){
       match($0,/.[^"]*/);
       if(substr($0,RSTART,RLENGTH+1) && substr($0,RSTART,RLENGTH+1) !~ /\"\| \"/ && substr($0,RSTART,RLENGTH+1) !~ /^\"$/ && substr($0,RSTART,RLENGTH+1) !~ /^\" \"$/){
       array[++i]=substr($0,RSTART,RLENGTH+1)
       };
       $0=substr($0,RLENGTH+1);
   };
   for(l=1;l<=len;l++){
       delete array[val[l]]
       };
   for(j=1;j<=length(array);j++){
       if(array[j]){
          gsub(/^\"|\"$/,"",array[j]);
          printf("%s%s",array[j],j==length(array)?"":" ")
                   }
       };
   print "";
   i="";
   delete array
   }'   Input_file

EDIT1:下面补充了带解释注释的代码。

awk -v num="2,3" 'BEGIN{ ##creating a variable named num whose value is comma seprated values of fields which you want to nullify, starting BEGIN section here.
    len=split(num, val,",") ##creating an array named val here whose delimiter is comma and creating len variable whose value is length of array val here.
                  }
   {while($0){ ##Starting a while loop here which will run for a single line till that line is NOT getting null.
       match($0,/.[^"]*/);##using match functionality which will look for matches from starting to till a " comes into match.
       if(substr($0,RSTART,RLENGTH+1) && substr($0,RSTART,RLENGTH+1) !~ /\"\| \"/ && substr($0,RSTART,RLENGTH+1) !~ /^\"$/ && substr($0,RSTART,RLENGTH+1) !~ /^\" \"$/){##So RSTATR and RLENGTH are the variables which will be set when a regex is having a match in line/variable passed into match function. In this if condition I am checking 1st: value of substring of RSTART,RLENGTH+1 should not be NULL. 2nd: Then checking this substring should not be having " pipe space ". 3rd condition: Checking if substring is NOT equal to a string which starts from " and ending with it. 4th condition: Checking here if substring is NOT equal to ^" space "$, if all conditions are TRUE then do following actions.
       array[++i]=substr($0,RSTART,RLENGTH+1) ##creating an array named array whose index is variable i with increasing value of i and its value is substring of RSTART to till RLENGTH+1.
       };
       $0=substr($0,RLENGTH+1);##Now removing the matched part from current line which will decrease the length of line and avoid the while loop to become as infinite.
   };
   for(l=1;l<=len;l++){##Starting a loop here once while above loop is done which runs from starting of variable l=1 to value of len.
       delete array[val[l]] ##Deleting here those values which we want to REMOVE from OPs request, so removing here.
       };
   for(j=1;j<=length(array);j++){##Start a for loop from the value of j=1 till the value of lengthh of array.
       if(array[j]){ ##Now making sure array value whose index is j is NOT NULL, if yes then perform following statements.
          gsub(/^\"|\"$/,"",array[j]); ##Globally substituting starting " and ending " with NULL in value of array value.
          printf("%s%s",array[j],j==length(array)?"":" ") ##Now printing the value of array and secondly printing space or null depending upon if j value is equal to array length then print NULL else print space. It is because we don not want space at the last of the line.
                   }
       };
   print ""; ##Because above printf will NOT print a new line, so printing a new line.
   i=""; ##Nullifying variable i here.
   delete array ##Deleting array here.
   }' Input_file  ##Mentioning Input_file here.