Question

我的输入文件如下：

SL3.0ch00   maker_ITAG  exon    16480   16794   .   +   .   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_name "Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00   maker_ITAG  exon    16879   17940   .   +   .   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_name "Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00   maker_ITAG  CDS 16480   16794   .   +   0   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_name "Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00   maker_ITAG  CDS 16879   17940   .   +   0   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_name "Solyc00g005000.3"; gene_biotype "protein_coding";

所需的输出：

SL3.0ch00   maker_ITAG  exon    16480   16794   .   +   .   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00   maker_ITAG  exon    16879   17940   .   +   .   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00   maker_ITAG  CDS 16480   16794   .   +   0   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00   maker_ITAG  CDS 16879   17940   .   +   0   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";

我想从文件的所有行中删除“gene_name "某个名称";”这一部分。我使用了以下命令：

awk '{prinf$13=$14=""; print $0}' input_file

但是前几列的所有格式都被更改（空格代替制表符）。请帮助。任何其他命令或方法也可以。

Answer 1

使用awk：（注意：下面的代码块是 Swift/RxSwift 代码，并不是 awk 命令，与本问题无关，应为误贴。）

    /// Builds the request pipeline for the invited-user list.
    ///
    /// Each element emitted by `dateParameterTrigger` is observed on `queue` and turned into a
    /// `CustomerTarget.GetInvitedUserList` request issued through `provider`; the responses of
    /// those requests are flattened into the returned observable.
    ///
    /// - Returns: An observable of the raw responses, one request per trigger element.
    fileprivate func apiCall() -> Observable<Response> {
                // Read once, when the pipeline is assembled, and reused for every request.
                let currentUserID = UserModel.sharedInstance.uID
                let dateRanges = self.dateParameterTrigger.observeOn(self.queue)

                return dateRanges.flatMap { dateRange -> Observable<Response> in
                    // Elements 0 and 1 of the trigger value are sent as the start and end date strings.
                    // The literals 0 and 20 are presumably a page offset and page size — TODO confirm.
                    let target = CustomerTarget.GetInvitedUserList(currentUserID, 0, 20, dateRange.0, dateRange.1)

                    // NOTE(review): `share` is applied to the per-request observable inside `flatMap`,
                    // and every call to `apiCall()` assembles a new chain, so separate callers
                    // presumably each trigger their own request — confirm this is intended.
                    return self.provider.rx.request(target)
                        .asObservable()
                        .share(replay: 1, scope: SubjectLifetimeScope.forever)
                }
            }

            /// Maps every response from `apiCall()` to the summary stored in the decoded model's `returnValue`.
            ///
            /// The response data is decoded as `CustomerModel`. When decoding throws (the error is
            /// printed) or `returnValue` is nil, an empty `CustomerListJSON()` is emitted instead,
            /// so the resulting stream does not error because of a bad payload.
            ///
            /// - Returns: An observable emitting one `CustomerListJSON` per API response.
            func getSummery() -> Observable<CustomerListJSON> {
                return apiCall()
                    .flatMap { event -> Observable<CustomerListJSON> in
                        let fallback = Observable.of(CustomerListJSON())

                        // A plain `let` that is definitely initialised on the success path replaces
                        // the former implicitly unwrapped optional.
                        let decoded: CustomerModel
                        do {
                            decoded = try JSONDecoder().decode(CustomerModel.self, from: event.data)
                        } catch {
                            print( error )
                            return fallback
                        }

                        guard let summary = decoded.returnValue else {
                            return fallback
                        }

                        return Observable.just(summary)
                    }
            }

            /// Maps every response from `apiCall()` to the customer list stored in the decoded model.
            ///
            /// The response data is decoded as `CustomerModel` and the list is read from
            /// `returnValue.UserList`. When decoding throws (the error is printed) or either of
            /// those values is nil, an empty array is emitted instead.
            ///
            /// - Returns: An observable emitting one `[CustomerJSON]` per API response.
            func getCustomerList() -> Observable<[CustomerJSON]> {
                return self.apiCall()
                    .flatMap { event -> Observable<[CustomerJSON]> in
                        let fallback = Observable.of([CustomerJSON]())

                        // A plain `let` that is definitely initialised on the success path replaces
                        // the former implicitly unwrapped optional.
                        let decoded: CustomerModel
                        do {
                            decoded = try JSONDecoder().decode(CustomerModel.self, from: event.data)
                        } catch {
                            print( error )
                            return fallback
                        }

                        guard let summary = decoded.returnValue, let customers = summary.UserList else {
                            return fallback
                        }

                        return Observable.just(customers)
                    }
            }

使用awk：

awk 'BEGIN{FS=OFS=";"} {print $1,$2,$4,$5}' file

输出：

SL3.0ch00   maker_ITAG  exon    16480   16794   .   +   .   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00   maker_ITAG  exon    16879   17940   .   +   .   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00   maker_ITAG  CDS 16480   16794   .   +   0   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00   maker_ITAG  CDS 16879   17940   .   +   0   transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";

请参阅：The Stack Overflow Regular Expressions FAQ

Answer 2

请尝试以下操作。（如果您的Input_file由TAB分隔，请使用-F"\t"。）

awk 'match($0,/ gene_name[^;]*/){print substr($0,1,RSTART-1) substr($0,RSTART+RLENGTH+1);next} 1' Input_file

现在再添加一个非单行（多行）形式的解决方案，并附上解释。

awk '
match($0,/ gene_name[^;]*/){                               ##match() searches the record for " gene_name" followed by everything up to, but not including, the next semicolon.
  print substr($0,1,RSTART-1) substr($0,RSTART+RLENGTH+1)  ##Print the text before the match and then the text after it; match() sets RSTART and RLENGTH to the start position and length of the matched text, and the extra +1 skips the semicolon that follows the match.
  next                                                     ##Skip the remaining rules for this record so that it is not printed a second time.
}
1                                                          ##A true condition with no action prints the record unchanged; only records without a gene_name match reach this rule.
' Input_file                                               ##Name of the input file to process.

Answer 3

您的部分字段用制表符分隔，另一些字段则用分号加可选的空格分隔。您可以使用FS="\t|; ?"让awk进行拆分，这样能够正确识别各个字段，但每个字段周围的具体分隔符不会被保留，而之后把记录重新拼接起来时需要用到它们。这正是GNU awk的split()函数提供第4个参数的原因：它可以同时保存字段和分隔符。在您的情况下，可以这样使用：

nf = split($0,flds,/\t|; ?/,seps)

看看对输入的第一条记录有什么作用

$ cat tst.awk
# Split every record on a tab or on ";" with an optional trailing space.
# Fields go to flds[] and the matched separators to seps[]
# (the 4-argument form of split() is a GNU awk extension).
{
    nf = split($0, flds, /\t|; ?/, seps)
}

# For the first record only: show the record, then each field with the separator that follows it.
NR == 1 {
    printf "$0=<%s>\n", $0
    for (k = 1; k <= nf; k++)
        printf "  flds[%d] = <%s>\n  seps[%d] = <%s>\n", k, flds[k], k, seps[k]
}

。

$ awk -f tst.awk file
$0=<SL3.0ch00   maker_ITAG      exon    16480   16794   .       +       .       transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_name "Solyc00g005000.3"; gene_biotype "protein_coding";>
  flds[1] = <SL3.0ch00>
  seps[1] = <   >
  flds[2] = <maker_ITAG>
  seps[2] = <   >
  flds[3] = <exon>
  seps[3] = <   >
  flds[4] = <16480>
  seps[4] = <   >
  flds[5] = <16794>
  seps[5] = <   >
  flds[6] = <.>
  seps[6] = <   >
  flds[7] = <+>
  seps[7] = <   >
  flds[8] = <.>
  seps[8] = <   >
  flds[9] = <transcript_id "mRNA:Solyc00g005000.3.1">
  seps[9] = <; >
  flds[10] = <gene_id "gene:Solyc00g005000.3">
  seps[10] = <; >
  flds[11] = <gene_name "Solyc00g005000.3">
  seps[11] = <; >
  flds[12] = <gene_biotype "protein_coding">
  seps[12] = <;>
  flds[13] = <>
  seps[13] = <>

看看您不仅可以访问flds[]数组中的每个字段，而且还可以访问seps[]数组中的每个字段周围的分隔符？因此，要删除字段，只需将数组中的相应元素设置为null并重新组合记录：

$ cat tst.awk
# Remove the 11th field (gene_name in the sample input) together with the
# separator that follows it, then rebuild the record from the remaining fields
# and their original separators so the tab layout is left untouched.
{
    # The 4-argument form of split() is a GNU awk extension; seps[] receives the matched separators.
    nf = split($0, flds, /\t|; ?/, seps)

    flds[11] = ""
    seps[11] = ""

    $0 = join(nf, flds, seps)

    print
}

# Return f[1] s[1] f[2] s[2] ... f[n] s[n] concatenated into one string.
function join(n, f, s,    idx, out) {
    for (idx = 1; idx <= n; idx++)
        out = out f[idx] s[idx]
    return out
}

。

$ awk -f tst.awk file
SL3.0ch00       maker_ITAG      exon    16480   16794   .       +       .       transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00       maker_ITAG      exon    16879   17940   .       +       .       transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00       maker_ITAG      CDS     16480   16794   .       +       0       transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";
SL3.0ch00       maker_ITAG      CDS     16879   17940   .       +       0       transcript_id "mRNA:Solyc00g005000.3.1"; gene_id "gene:Solyc00g005000.3"; gene_biotype "protein_coding";

如何在不更改格式的情况下删除文件中的特定列

3 个答案: