在R中使用grep匹配列中的模式

时间:2015-08-07 13:44:06

标签: r parsing split grep

我正在尝试匹配AdditionalInfo列中的各种键值模式,然后将键值对输出为R中的单独列。

我的单列具有这样的值,键值对由分号(;)分隔:

gene_id "ENSG00000223972.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "DDX11L1"; level 2; havana_gene "OTTHUMG00000000961.2";

所以我想用grep匹配形如 gene_id "E[*]"; 的模式,然后将匹配到的内容输出到新列;再用grep匹配形如 gene_type "[Aa-Zz]"; 的模式,然后将匹配到的内容输出到另一个新列,依此类推。

我不能只在分号上拆分列,因为有些行有6个键值对,有些行有13个键值对,它们的顺序不同,它们是唯一的值。

任何人都可以帮我吗?

我尝试使用的代码如下:

geneID <- og[grep("gene_id "E[*]";", og$AdditionalInfo),]

谢谢你的时间!

修改

我的数据如下:

> names(og)
[1] "Chromosome"     "AnnotSource"    "FeatureType"    "Start"          "Stop"          
[6] "Score"          "Strand"         "GenomicPhase"   "AdditionalInfo"

> head(og)
  Chromosome AnnotSource FeatureType Start  Stop Score Strand GenomicPhase
1       chr1      HAVANA        gene 11869 14409     .      +            .
2       chr1      HAVANA  transcript 11869 14409     .      +            .
3       chr1      HAVANA        exon 11869 12227     .      +            .
4       chr1      HAVANA        exon 12613 12721     .      +            .
5       chr1      HAVANA        exon 13221 14409     .      +            .
6       chr1      HAVANA  transcript 12010 13670     .      +            .

AdditionalInfo
1 gene_id ENSG00000223972.5; gene_type transcribed_unprocessed_pseudogene;     gene_status KNOWN; gene_name DDX11L1; level 2; havana_gene OTTHUMG00000000961.2;
2 gene_id ENSG00000223972.5; transcript_id ENST00000456328.2; gene_type transcribed_unprocessed_pseudogene; gene_status KNOWN; gene_name DDX11L1; transcript_type processed_transcript; transcript_status KNOWN; transcript_name DDX11L1-002; level 2; tag basic; transcript_support_level 1; havana_gene OTTHUMG00000000961.2; havana_transcript OTTHUMT00000362751.1;
3 gene_id ENSG00000223972.5; transcript_id ENST00000456328.2; gene_type transcribed_unprocessed_pseudogene; gene_status KNOWN; gene_name DDX11L1; transcript_type processed_transcript; transcript_status KNOWN; transcript_name DDX11L1-002; exon_number 1; exon_id ENSE00002234944.1; level 2; tag basic; transcript_support_level 1; havana_gene OTTHUMG00000000961.2; havana_transcript OTTHUMT00000362751.1;
4 gene_id ENSG00000223972.5; transcript_id ENST00000456328.2; gene_type transcribed_unprocessed_pseudogene; gene_status KNOWN; gene_name DDX11L1; transcript_type processed_transcript; transcript_status KNOWN; transcript_name DDX11L1-002; exon_number 2; exon_id ENSE00003582793.1; level 2; tag basic; transcript_support_level 1; havana_gene OTTHUMG00000000961.2; havana_transcript OTTHUMT00000362751.1;
5 gene_id ENSG00000223972.5; transcript_id ENST00000456328.2; gene_type transcribed_unprocessed_pseudogene; gene_status KNOWN; gene_name DDX11L1; transcript_type processed_transcript; transcript_status KNOWN; transcript_name DDX11L1-002; exon_number 3; exon_id ENSE00002312635.1; level 2; tag basic; transcript_support_level 1; havana_gene OTTHUMG00000000961.2; havana_transcript OTTHUMT00000362751.1;
6 gene_id ENSG00000223972.5; transcript_id ENST00000450305.2; gene_type transcribed_unprocessed_pseudogene; gene_status KNOWN; gene_name DDX11L1; transcript_type transcribed_unprocessed_pseudogene; transcript_status KNOWN; transcript_name DDX11L1-001; level 2; ont PGO:0000005; ont PGO:0000019; tag basic; transcript_support_level NA; havana_gene OTTHUMG00000000961.2; havana_transcript OTTHUMT00000002844.2;

> dput(head(og))
structure(list(Chromosome = c("chr1", "chr1", "chr1", "chr1", 
"chr1", "chr1"), AnnotSource = c("HAVANA", "HAVANA", "HAVANA", 
"HAVANA", "HAVANA", "HAVANA"), FeatureType = c("gene", "transcript", 
"exon", "exon", "exon", "transcript"), Start = c(11869L, 11869L, 
11869L, 12613L, 13221L, 12010L), Stop = c(14409L, 14409L, 12227L, 
12721L, 14409L, 13670L), Score = c(".", ".", ".", ".", ".", "."
), Strand = c("+", "+", "+", "+", "+", "+"), GenomicPhase = c(".", 
".", ".", ".", ".", "."), AdditionalInfo = c("gene_id ENSG00000223972.5; gene_type transcribed_unprocessed_pseudogene; gene_status KNOWN; gene_name DDX11L1; level 2; havana_gene OTTHUMG00000000961.2;", 
"gene_id ENSG00000223972.5; transcript_id ENST00000456328.2; gene_type transcribed_unprocessed_pseudogene; gene_status KNOWN; gene_name DDX11L1; transcript_type processed_transcript; transcript_status KNOWN; transcript_name DDX11L1-002; level 2; tag basic; transcript_support_level 1; havana_gene OTTHUMG00000000961.2; havana_transcript OTTHUMT00000362751.1;", 
"gene_id ENSG00000223972.5; transcript_id ENST00000456328.2; gene_type transcribed_unprocessed_pseudogene; gene_status KNOWN; gene_name DDX11L1; transcript_type processed_transcript; transcript_status KNOWN; transcript_name DDX11L1-002; exon_number 1; exon_id ENSE00002234944.1; level 2; tag basic; transcript_support_level 1; havana_gene OTTHUMG00000000961.2; havana_transcript OTTHUMT00000362751.1;", 
"gene_id ENSG00000223972.5; transcript_id ENST00000456328.2; gene_type transcribed_unprocessed_pseudogene; gene_status KNOWN; gene_name DDX11L1; transcript_type processed_transcript; transcript_status KNOWN; transcript_name DDX11L1-002; exon_number 2; exon_id ENSE00003582793.1; level 2; tag basic; transcript_support_level 1; havana_gene OTTHUMG00000000961.2; havana_transcript OTTHUMT00000362751.1;", 
"gene_id ENSG00000223972.5; transcript_id ENST00000456328.2; gene_type transcribed_unprocessed_pseudogene; gene_status KNOWN; gene_name DDX11L1; transcript_type processed_transcript; transcript_status KNOWN; transcript_name DDX11L1-002; exon_number 3; exon_id ENSE00002312635.1; level 2; tag basic; transcript_support_level 1; havana_gene OTTHUMG00000000961.2; havana_transcript OTTHUMT00000362751.1;", 
"gene_id ENSG00000223972.5; transcript_id ENST00000450305.2; gene_type transcribed_unprocessed_pseudogene; gene_status KNOWN; gene_name DDX11L1; transcript_type transcribed_unprocessed_pseudogene; transcript_status KNOWN; transcript_name DDX11L1-001; level 2; ont PGO:0000005; ont PGO:0000019; tag basic; transcript_support_level NA; havana_gene OTTHUMG00000000961.2; havana_transcript OTTHUMT00000002844.2;"
)), .Names = c("Chromosome", "AnnotSource", "FeatureType", "Start", 
"Stop", "Score", "Strand", "GenomicPhase", "AdditionalInfo"), row.names = c(NA, 
6L), class = "data.frame")

1 个答案:

答案 0 :(得分:1)

您可以使用带捕获组的正则表达式,提取gene_id之后、下一个分号之前的内容(您用dput发布的数据中,值本身不带引号)。例如,使用您发布的数据:

sub('.*gene_id ([^;]*).*',"\\1",og$AdditionalInfo)
sub('.*gene_type ([^;]*).*',"\\1",og$AdditionalInfo)

输出:

#[1] "ENSG00000223972.5"
#[1] "transcribed_unprocessed_pseudogene"

注意:当某一行没有匹配项时,sub会原样返回整个输入字符串。如果希望在没有匹配项时得到NA,可以改用stringr包(library(stringr))中的str_match:

str_match(og$AdditionalInfo,".*transcript_id ([^;]*).*")[,2]

输出:

#[1] NA                  "ENST00000456328.2" "ENST00000456328.2" "ENST00000456328.2"
#[5] "ENST00000456328.2" "ENST00000450305.2"