我有基因列表的data.frame。
样本数据-
gene_name "PPAP2C"
gene_name "PPaw2C"
gene_name "PAP2C"
gene_name "APAP2C"
gene_name "PP102C"
我想将此数据分为两列
结果数据应为-
PPAP2C
PPaw2C
PAP2C
APAP2C
PP102C
我尝试使用-
# NOTE(review): x4_1 is both the piped-in data and the column argument here;
# presumably that is what triggers the error quoted below — confirm.
xx = x4_1%>% separate(x4_1, c("A","B") , " " )
错误-
错误:`var` 必须求值为单个数字或列名,而不是列表。调用 `rlang::last_error()` 查看回溯。
答案 0 :(得分:0)
您使用的 `separate` 方法在我这里可以正常运行:
library(tidyr)
# Sample input: one character column whose values look like
# `gene_name "<code>"`. Single-quoted literals avoid escaping the inner quotes.
df <- data.frame(
  genes = c(
    'gene_name "PPAP2C"',
    'gene_name "PPaw2C"',
    'gene_name "PAP2C"',
    'gene_name "APAP2C"',
    'gene_name "PP102C"'
  ),
  stringsAsFactors = FALSE
)
> df
genes
1 gene_name "PPAP2C"
2 gene_name "PPaw2C"
3 gene_name "PAP2C"
4 gene_name "APAP2C"
5 gene_name "PP102C"
# Split `genes` at the space into a label column and a code column; the code
# column still carries its surrounding double quotes at this point.
df <- tidyr::separate(
  data = df,
  col = genes,
  into = c("geneName", "geneCode"),
  sep = " "
)
> df
geneName geneCode
1 gene_name "PPAP2C"
2 gene_name "PPaw2C"
3 gene_name "PAP2C"
4 gene_name "APAP2C"
5 gene_name "PP102C"
# Strip only the literal double quotes that wrap each code. A blanket
# "[[:punct:]]" pattern would also delete hyphens, dots and underscores that
# can be part of a gene symbol (e.g. "HLA-DRB1" would become "HLADRB1").
df$geneCode <- gsub("\"", "", df$geneCode, fixed = TRUE)
答案 1 :(得分:0)
您已经很接近了。该函数的帮助文档(输入 `?separate` 即可查看)说明了需要提供哪些参数,并附有示例。
library(dplyr)
# Sample gene codes, each then prefixed with an indexed "gene_name<i> " label.
tt <- c("PPAP2C", "PPaw2C", "PAP2C", "APAP2C", "PP102C")
tt <- paste0("gene_name", seq_along(tt), " ", tt)
# Wrap the strings in a one-column tibble, then split that column on the
# whitespace character into `col1` and `col2`. The split result is not
# assigned, so at top level it is simply printed.
dframe <- tibble(col_to_split = tt)
tidyr::separate(
  dframe,
  col = "col_to_split",
  into = c("col1", "col2"),
  sep = "\\s"
)
注:如果您经常需要处理字符串(看起来确实如此 ^^),建议您了解一下软件包 stringr(字符串操作)和 stringi(编码处理):
http://edrub.in/CheatSheets/cheatSheetStringr.pdf
答案 2 :(得分:0)
library(stringr)
# Sample input: one character column whose values look like
# `gene_name "<code>"`. Single-quoted literals avoid escaping the inner quotes.
df <- data.frame(
  genes = c(
    'gene_name "PPAP2C"',
    'gene_name "PPaw2C"',
    'gene_name "PAP2C"',
    'gene_name "APAP2C"',
    'gene_name "PP102C"'
  ),
  stringsAsFactors = FALSE
)
df
# genes
# 1 gene_name "PPAP2C"
# 2 gene_name "PPaw2C"
# 3 gene_name "PAP2C"
# 4 gene_name "APAP2C"
# 5 gene_name "PP102C"
# Split each string at the space into a character matrix and convert it to a
# data frame. `TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned; `stringsAsFactors = FALSE` keeps the columns as character
# on R versions older than 4.0 as well.
df2 <- as.data.frame(
  str_split(string = df$genes, pattern = " ", simplify = TRUE),
  stringsAsFactors = FALSE
)
names(df2) <- c("geneName", "geneCode")
# Drop the double quotes that surround each code.
df2$geneCode <- gsub(pattern = "\"", replacement = "", x = df2$geneCode)
# geneName geneCode
# 1 gene_name PPAP2C
# 2 gene_name PPaw2C
# 3 gene_name PAP2C
# 4 gene_name APAP2C
# 5 gene_name PP102C