我有随附的数据框。
数据
# Example data as dput() output: a data.frame with six rows (row names
# s3a-s3f) and the columns associated_gene, chr_name, chrom_start, allele and
# refsnp_id. allele holds "/"-separated values; row s3f has "NOT FOUND" in
# every character column and 0L in chrom_start.
structure(list(associated_gene = c(NA, NA, "A4GALT", NA, NA,
"NOT FOUND"), chr_name = c("22", "22", "22", "22", "22", "NOT FOUND"
), chrom_start = c(42693910L, 42693843L, 42693321L, 42693665L,
42693653L, 0L), allele = c("G/A/T", "T/C", "G/C", "C/T", "G/A/T",
"NOT FOUND"), refsnp_id = c("rs778598915", "rs11541159", "rs397514502",
"rs762949801", "rs776304817", "NOT FOUND")), row.names = c("s3a",
"s3b", "s3c", "s3d", "s3e", "s3f"), class = "data.frame")
    associated_gene  chr_name chrom_start    allele   refsnp_id
s3a            <NA>        22    42693910     G/A/T rs778598915
s3b            <NA>        22    42693843       T/C  rs11541159
s3c          A4GALT        22    42693321       G/C rs397514502
s3d            <NA>        22    42693665       C/T rs762949801
s3e            <NA>        22    42693653     G/A/T rs776304817
s3f       NOT FOUND NOT FOUND           0 NOT FOUND   NOT FOUND
我想在 allele 列的第一个 "/" 处将其拆分为两部分(Ref 和 Var),然后将这两列插入到 $chrom_start 和 $refsnp_id 之间。
理想的输出是:
    associated_gene chr_name chrom_start Ref Var   refsnp_id
s3a            <NA>       22    42693910   G A/T rs778598915
s3b            <NA>       22    42693843   T   C  rs11541159
我不知道我是否可以加载awk,但是在bash中我会这样做:
cat alleles | awk -F"/" '{print $1"\t"$2}'
答案 0 :(得分:1)
我们可以使用 tidyr 中的 extract:从字符串的开头(^)捕获不是 / 的字符([^/]+),其后跟一个 /,然后捕获其余字符(.*)。
library(tidyr)
library(dplyr)
# Split `allele` at its first "/": capture group 1 (everything before it)
# becomes Ref and capture group 2 (everything after it) becomes Var. Rows
# whose allele does not match the regex get NA in both new columns.
extract(
  df1,
  col = allele,
  into = c("Ref", "Var"),
  regex = "^([^/]+)/(.*)"
)
# associated_gene chr_name chrom_start Ref Var refsnp_id
#s3a <NA> 22 42693910 G A/T rs778598915
#s3b <NA> 22 42693843 T C rs11541159
#s3c A4GALT 22 42693321 G C rs397514502
#s3d <NA> 22 42693665 C T rs762949801
#s3e <NA> 22 42693653 G A/T rs776304817
#s3f NOT FOUND NOT FOUND 0 <NA> <NA> NOT FOUND
另一个选择是使用 stringr 的 str_split:
library(stringr)
# Split each allele at the first "/" only (n = 2). simplify = TRUE returns a
# two-column character matrix padded with "" when there is nothing after the
# split. With do.call(rbind, <list>) an allele without "/" (e.g. "NOT FOUND")
# is a length-1 piece that rbind silently recycles into both columns.
stringr::str_split(df1$allele, "/", n = 2, simplify = TRUE)
或者使用 base R:先用 sub 把第一个 / 替换为分隔符,然后用 read.table/read.csv 读取。
答案 1 :(得分:1)
另一种解决方案是使用 stringr 包:
# Install stringr only when it is missing, so re-running the script does not
# reinstall the package every time; then attach it.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
library(stringr)
数据:
# Example data: six records keyed by row names s3a-s3f; the allele column
# holds "/"-separated values.
df <- data.frame(
  associated_gene = c(NA, NA, "A4GALT", NA, NA, "NOT FOUND"),
  chr_name = c("22", "22", "22", "22", "22", "NOT FOUND"),
  chrom_start = c(
    42693910L, 42693843L, 42693321L, 42693665L, 42693653L, 0L
  ),
  allele = c("G/A/T", "T/C", "G/C", "C/T", "G/A/T", "NOT FOUND"),
  refsnp_id = c(
    "rs778598915", "rs11541159", "rs397514502",
    "rs762949801", "rs776304817", "NOT FOUND"
  ),
  row.names = c("s3a", "s3b", "s3c", "s3d", "s3e", "s3f"),
  stringsAsFactors = FALSE
)
创建一个包含两个新变量的新df:
# Ref is everything before the first "/", Var is everything after it.
# Anchoring on "/" instead of on a single \w keeps multi-base alleles
# ("AT/G") and non-word placeholders ("-/A") intact; alleles without a "/"
# yield NA in both columns.
new_df <- data.frame(
  Ref = str_extract(df$allele, "^[^/]+(?=/)"),
  Var = str_extract(df$allele, "(?<=/).+")
)
new_df
Ref Var
1 G A/T
2 T C
3 G C
4 C T
5 G A/T
6 <NA> <NA>
然后将 new_df 与 df 进行列绑定(去掉现在已不再需要的 allele 列):
# Put Ref/Var where `allele` used to be (between chrom_start and refsnp_id),
# selecting the surrounding columns by name rather than by position.
cbind(
  df[c("associated_gene", "chr_name", "chrom_start")],
  new_df,
  df["refsnp_id"]
)
#     associated_gene  chr_name chrom_start  Ref  Var   refsnp_id
# s3a            <NA>        22    42693910    G  A/T rs778598915
# s3b            <NA>        22    42693843    T    C  rs11541159
# s3c          A4GALT        22    42693321    G    C rs397514502
# s3d            <NA>        22    42693665    C    T rs762949801
# s3e            <NA>        22    42693653    G  A/T rs776304817
# s3f       NOT FOUND NOT FOUND           0 <NA> <NA>   NOT FOUND
答案 2 :(得分:1)
根据你希望最后一行如何输出,可以使用以下方法之一:
使用tidyr::separate
library(tidyr)
# Split `allele` on "/" into Ref and Var. extra = "merge" keeps any further
# "/"-separated pieces together in Var; fill = "right" pads an allele that
# has no "/" with NA on the right.
separate(
  df,
  col = allele,
  into = c("Ref", "Var"),
  sep = "/",
  extra = "merge",
  fill = "right"
)
# associated_gene chr_name chrom_start Ref Var refsnp_id
#s3a <NA> 22 42693910 G A/T rs778598915
#s3b <NA> 22 42693843 T C rs11541159
#s3c A4GALT 22 42693321 G C rs397514502
#s3d <NA> 22 42693665 C T rs762949801
#s3e <NA> 22 42693653 G A/T rs776304817
#s3f NOT FOUND NOT FOUND 0 NOT FOUND <NA> NOT FOUND
或者使用 stringr::str_match
# Lazily match up to the first "/" (group 1) and take the remainder (group 2).
# Dropping column 1 (the full match) leaves only the two capture groups.
stringr::str_match(
  df$allele,
  "(.*?)/(.*)"
)[, -1]
# [,1] [,2]
#[1,] "G" "A/T"
#[2,] "T" "C"
#[3,] "G" "C"
#[4,] "C" "T"
#[5,] "G" "A/T"
#[6,] NA NA