如何在不更改格式的情况下删除文件中的特定列

时间:2018-08-04 12:07:25

标签: awk

我的输入文件如下:

SL3.0ch00   maker_ITAG  exon    16480   16794   .   +   .   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_name "Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00   maker_ITAG  exon    16879   17940   .   +   .   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_name "Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00   maker_ITAG  CDS 16480   16794   .   +   0   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_name "Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00   maker_ITAG  CDS 16879   17940   .   +   0   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_name "Solyc00g005000.3"; gene_biotype "protein_coding";

所需的输出:

SL3.0ch00   maker_ITAG  exon    16480   16794   .   +   .   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00   maker_ITAG  exon    16879   17940   .   +   .   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00   maker_ITAG  CDS 16480   16794   .   +   0   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00   maker_ITAG  CDS 16879   17940   .   +   0   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";

我想从文件的所有行中删除 gene_name "某个名称"; 这一部分。我使用了以下命令:

awk '{prinf$13=$14=""; print $0}' input_file 

但是前几列的所有格式都被更改(空格代替制表符)。请帮助。任何其他命令或方法也可以。

3 个答案:

答案 0 :(得分:1)

使用awk:

    /// Builds the observable that performs the invited-user-list request.
    ///
    /// Each element emitted by `dateParameterTrigger` is observed on `queue`
    /// and mapped to a `CustomerTarget.GetInvitedUserList` request whose
    /// observable is shared with a replay buffer of one element.
    ///
    /// - Returns: An observable of the raw responses of those requests.
    fileprivate func apiCall() -> Observable<Response> {
            // Read once, when the chain is built, and captured by the closure below.
            let currentUserID = UserModel.sharedInstance.uID
            let scheduledTrigger = self.dateParameterTrigger.observeOn(self.queue)

            return scheduledTrigger.flatMap { dateRange -> Observable<Response> in
                // NOTE(review): 0 and 20 look like a paging offset and page size — confirm
                // against the definition of GetInvitedUserList.
                let target = CustomerTarget.GetInvitedUserList(userID: currentUserID, 0, 20, dateRange.0, dateRange.1)
                let responses = self.provider.rx.request(target).asObservable()
                return responses.share(replay: 1, scope: SubjectLifetimeScope.forever)
            }
            }

            /// Emits the `returnValue` payload decoded from each API response.
            ///
            /// The response data is decoded as JSON into `CustomerModel`. If decoding
            /// throws (the error is printed) or `returnValue` is nil, an empty
            /// `CustomerListJSON()` is emitted instead.
            func getSummery() -> Observable<CustomerListJSON> {
                return apiCall().flatMap { response -> Observable<CustomerListJSON> in
                    // Fallback used for every failure path.
                    let fallback = Observable.of(CustomerListJSON())

                    let decoded: CustomerModel
                    do {
                        decoded = try JSONDecoder().decode(CustomerModel.self, from: response.data)
                    } catch {
                        print(error)
                        return fallback
                    }

                    guard let summary = decoded.returnValue else {
                        return fallback
                    }
                    return Observable.just(summary)
                }
            }

            /// Emits the `UserList` array decoded from each API response.
            ///
            /// The response data is decoded as JSON into `CustomerModel`. If decoding
            /// throws (the error is printed), or either `returnValue` or its `UserList`
            /// is nil, an empty array is emitted instead.
            func getCustomerList() -> Observable<[CustomerJSON]> {
                return self.apiCall().flatMap { response -> Observable<[CustomerJSON]> in
                    // Fallback used for every failure path.
                    let emptyList = Observable.of([CustomerJSON]())

                    let decoded: CustomerModel
                    do {
                        decoded = try JSONDecoder().decode(CustomerModel.self, from: response.data)
                    } catch {
                        print(error)
                        return emptyList
                    }

                    guard let payload = decoded.returnValue, let users = payload.UserList else {
                        return emptyList
                    }
                    return Observable.just(users)
                }
            }

使用awk:

awk 'BEGIN{FS=OFS=";"} {print $1,$2,$4,$5}' file

输出:

SL3.0ch00   maker_ITAG  exon    16480   16794   .   +   .   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00   maker_ITAG  exon    16879   17940   .   +   .   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00   maker_ITAG  CDS 16480   16794   .   +   0   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00   maker_ITAG  CDS 16879   17940   .   +   0   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";

请参阅:The Stack Overflow Regular Expressions FAQ

答案 1 :(得分:1)

请尝试以下操作。(如果您的Input_file由TAB分隔,请使用-F"\t")

awk 'match($0,/ gene_name[^;]*/){print substr($0,1,RSTART-1) substr($0,RSTART+RLENGTH+1);next} 1' Input_file

现在也添加一种非单行(多行)形式的解决方案并附上解释。

awk '
match($0,/ gene_name[^;]*/){                               ## Look for a space followed by gene_name and everything up to, but not including, the next semicolon.
  print substr($0,1,RSTART-1) substr($0,RSTART+RLENGTH+1)  ## Print the text before the match followed by the text after it. match() sets RSTART (start position) and RLENGTH (length); the extra +1 also skips the single character right after the match, i.e. the semicolon.
  next                                                     ## Skip the remaining rules for this line so it is not printed a second time.
}
1                                                          ## Always-true pattern with the default action: print, unchanged, every line that had no gene_name match.
' Input_file                                               ## Name of the input file to process.

答案 2 :(得分:1)

您的一些字段用制表符分隔,而另一些字段则用分号(后跟可选的空格)分隔。您可以使用FS="\t|; ?"告诉awk按这两种分隔符进行拆分,这样可以正确识别各个字段,但每个字段周围的具体分隔符不会被保留,而稍后重新拼接记录时需要用到它们。这正是GNU awk的split()函数提供第4个参数的原因:它可以同时保存字段和分隔符。在您的情况下,可以这样使用:

nf = split($0,flds,/\t|; ?/,seps)

看看对输入的第一条记录有什么作用

$ cat tst.awk
{
    # Split the whole record on a tab, or on a semicolon optionally followed by
    # one space. The 4th argument (a GNU awk extension) stores the separator
    # text that followed each field in seps[], so it is not lost.
    nf = split($0,flds,/\t|; ?/,seps)
}
# For the first input record only: dump the record, then every field together
# with the separator that follows it.
NR == 1 {
    printf "$0=<%s>\n", $0
    for (i=1; i<=nf; i++) {
        printf "  flds[%d] = <%s>\n", i, flds[i]
        printf "  seps[%d] = <%s>\n", i, seps[i]
    }
}

$ awk -f tst.awk file
$0=<SL3.0ch00   maker_ITAG      exon    16480   16794   .       +       .       transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_name "Solyc00g005000.3"; gene_biotype "protein_coding";>
  flds[1] = <SL3.0ch00>
  seps[1] = <   >
  flds[2] = <maker_ITAG>
  seps[2] = <   >
  flds[3] = <exon>
  seps[3] = <   >
  flds[4] = <16480>
  seps[4] = <   >
  flds[5] = <16794>
  seps[5] = <   >
  flds[6] = <.>
  seps[6] = <   >
  flds[7] = <+>
  seps[7] = <   >
  flds[8] = <.>
  seps[8] = <   >
  flds[9] = <transcript_id "mRNA:Solyc00g005000.3.1">
  seps[9] = <; >
  flds[10] = <gene_id "gene:Solyc00g005000.3">
  seps[10] = <; >
  flds[11] = <gene_name "Solyc00g005000.3">
  seps[11] = <; >
  flds[12] = <gene_biotype "protein_coding">
  seps[12] = <;>
  flds[13] = <>
  seps[13] = <>

可以看到,您不仅可以通过flds[]数组访问每个字段,还可以通过seps[]数组访问每个字段后面的分隔符。因此,要删除某个字段,只需将两个数组中对应的元素设置为空,然后重新组合记录:

$ cat tst.awk
{
    # Split the record into fields (flds) and the separators that follow them
    # (seps); the 4-argument form of split() is a GNU awk extension.
    nf = split($0,flds,/\t|; ?/,seps)

    # Blank out field 11 (the gene_name attribute in the sample input) together
    # with the separator that follows it.
    flds[11] = seps[11] = ""

    # Rebuild the record from the remaining fields and their original
    # separators, so tabs and "; " are preserved exactly.
    $0 = join(nf,flds,seps)

    print
}
# join: return f[1] s[1] f[2] s[2] ... f[n] s[n] concatenated.
# i and o are locals, declared as extra parameters per awk convention.
function join(n,f,s,   i,o) {for (i=1;i<=n;i++) o=o f[i] s[i]; return o}

$ awk -f tst.awk file
SL3.0ch00       maker_ITAG      exon    16480   16794   .       +       .       transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00       maker_ITAG      exon    16879   17940   .       +       .       transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00       maker_ITAG      CDS     16480   16794   .       +       0       transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00       maker_ITAG      CDS     16879   17940   .       +       0       transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";