我在R中有一个具有1000行的数据框,变量var1_string是一个看起来像这样的字符串:
var_1_ID var1_string
1 "object.ID = 00001, object.ID.N = 1, object.subfeature.ID = 55555, object.subfeature.e1 = 1, object.subfeature.e2 = False, object.subfeature.ID = 66666,object.subfeature.e1 = 2, object.subfeature.e4 = 50 object.subfeature.e9 = abc, object.feature = 3, object.feature = cd, object.feature = ab.."
2 "object.ID = 00001, object.ID.N = 1, object.subfeature.ID = 55555, object.subfeature.e1 = 1, object.subfeature.e2 = False, object.subfeature.ID = 66666,object.subfeature.e1 = 2, object.subfeature.e4 = 50 object.subfeature.e9 = abc, object.feature = 3, object.feature = cd, object.feature = ab.."
and so on for n rows...
有没有一种快速的方法可以将以下内容提取到新列中:
new_column_1 = "object.ID = 00001, object.ID.N = 1"
new_column_2 到 new_column_n 各包含一个子特征块:从一个 object.subfeature.ID 开始,到下一个 object.subfeature.ID 之前为止,依此类推,直到没有更多子特征为止。
因此在此示例中:
new_column_2 = object.subfeature.ID = 55555, object.subfeature.e1 = 1, object.subfeature.e2 = False
,
然后,下一列包含下一个object.subfeature.ID
块,例如object.subfeature.ID = 66666,object.subfeature.e1 = 2, object.subfeature.e4 = 50
最后是一个单独的列,包含所有的 object.feature 项。
例如所需的输出:
Var_1_ID Var1_string New_col1 New_col2 New_col3
1 String as above object.ID = 00001, object.ID.N = 1 object.subfeature.ID = 55555, object.subfeature.e1 = 1, object.subfeature.e2 = False object.subfeature.ID = 66666,object.subfeature.e1 = 2, object.subfeature.e4 = 50
object.subfeature.e9 = abc, object.feature = 3, object.feature = cd, object.feature = ab..
2 String as above object.ID = 00001, object.ID.N = 1
有没有使用子串(substring)的方法?还是必须使用 stringr::str_extract 或 dplyr::mutate(strsplit(...))?
注意:object.subfeature 块(例如 `object.subfeature.ID = 55555` 及其后续各项)的数量是任意的,可以是 0 到 n 个。
答案 0 :(得分:0)
我认为这可能是一个好的开始
library(tidyverse)

# Two-row example input. Both rows carry the same string, so it is written
# once and repeated.
example_string <- "object.ID = 00001, object.ID.N = 1, object.subfeature.ID = 55555, object.subfeature.e1 = 1, object.subfeature.e2 = False, object.subfeature.ID = 66666,object.subfeature.e1 = 2, object.subfeature.e4 = 50 object.subfeature.e9 = abc, object.feature = 3, object.feature = cd"

# tibble() replaces the deprecated data_frame().
df <- tibble(
  Var_1_ID = c(1, 2),
  Var1_string = rep(example_string, 2)
)

# Split every string on "," into one row per "key = value" piece, split each
# piece into key (`col`) and value (`val`), then widen back to one row per
# Var_1_ID with one column per piece.
df2 <- df %>%
  mutate(newstring = str_split(Var1_string, ",")) %>%
  # Name the list-column explicitly; calling unnest() without it is deprecated.
  unnest(newstring) %>%
  # extra = "merge" keeps everything after the first "=" in `val`. One piece
  # of the sample has no comma between two pairs
  # ("... e4 = 50 object.subfeature.e9 = abc"); without this the text after
  # the second "=" is dropped with a warning.
  separate(newstring, into = c("col", "val"), sep = "=", extra = "merge") %>%
  select(-Var1_string) %>%
  # Number the pieces within each ID so repeated keys (e.g. object.feature)
  # still produce unique column names.
  group_by(Var_1_ID) %>%
  mutate(key = row_number()) %>%
  ungroup() %>%
  unite("new_var", key, col, sep = "_") %>%
  # spread() is superseded by pivot_wider(); it is kept here because it
  # orders the new columns by sorted key (so `10_...` follows `1_...`).
  # Keys and values are not trimmed, so they keep their surrounding spaces.
  spread(new_var, val)
df2
Var_1_ID `1_object.ID ` `10_ object.featu… `2_ object.ID.N… `3_ object.subfea…
1 1 " 00001" " cd" " 1" " 55555" " 1"
2 2 " 00001" " cd" " 1" " 55555" " 1"
请注意,我必须在变量名前加上一个序号,使每个列名都是唯一的,这样才能用 spread() 把它们展开成各自的列。
答案 1 :(得分:0)
# Prepare data to be split on `;`: replace the delimiter that follows each
# top-level piece with a semicolon.
# NOTE(review): `data` must already exist with columns Var_1_ID and
# Var1_string (character), in that order.
# NOTE(review): the first pattern only matches a single-digit object.ID.N,
# and the "." in every pattern matches any character -- confirm this is
# acceptable for the real data.
data$Var1_string <- gsub("(object.ID.N = [0-9]),", "\\1;", data$Var1_string)
data$Var1_string <- gsub(
  "(object.subfeature.e[0-9] = [0-9a-zA-Z]+)[^a-z]", "\\1;", data$Var1_string
)
data$Var1_string <- gsub(
  "(object.subfeature.ID = [0-9a-zA-Z]+)[^a-z]", "\\1;", data$Var1_string
)

# The row with the most pieces decides how many columns separate() creates.
ncol <- max(lengths(strsplit(data$Var1_string, split = ";")))

library(tidyr)
data <- data %>%
  separate(
    Var1_string,
    into = paste0("Col", seq_len(ncol)),
    sep = ";",
    remove = FALSE
  )

# Names for the split columns. They are specific to the example string, so
# fail loudly if the data produced a different number of pieces instead of
# assigning names to the wrong columns.
new_names <- c(
  "New_col", "object.subfeature.ID.e1", "object.subfeature.e1",
  "object.subfeature.e2", "object.subfeature.ID.e11", "object.subfeature.e11",
  "object.subfeature.e14", "object.subfeature.e19", "object.features"
)
stopifnot(ncol == length(new_names))
# The split columns start after Var_1_ID and Var1_string (positions 3:11).
colnames(data)[2 + seq_along(new_names)] <- new_names

# Strip the "key = " prefix from every subfeature column and print the result
# (`data` itself is left unchanged). Base R is used so that only tidyr has to
# be attached; the original relied on dplyr's deprecated mutate_at()/funs().
is_subfeature <- grepl("object.subfeature", colnames(data), fixed = TRUE)
stripped <- data
stripped[is_subfeature] <- lapply(
  stripped[is_subfeature],
  function(x) gsub("object.subfeature.e[0-9] = |object.subfeature.ID = ", "", x)
)
stripped
Var_1_ID
1 1
2 2
Var1_string
1 object.ID = 00001, object.ID.N = 1; object.subfeature.ID = 55555; object.subfeature.e1 = 1; object.subfeature.e2 = False; object.subfeature.ID = 66666;object.subfeature.e1 = 2; object.subfeature.e4 = 50;object.subfeature.e9 = abc; object.feature = 3, object.feature = cd, object.feature = ab..
2 object.ID = 00001, object.ID.N = 1; object.subfeature.ID = 55555; object.subfeature.e1 = 1; object.subfeature.e2 = False; object.subfeature.ID = 66666;object.subfeature.e1 = 2; object.subfeature.e4 = 50;object.subfeature.e9 = abc; object.feature = 3, object.feature = cd, object.feature = ab..
New_col object.subfeature.ID.e1 object.subfeature.e1
1 object.ID = 00001, object.ID.N = 1 55555 1
2 object.ID = 00001, object.ID.N = 1 55555 1
object.subfeature.e2 object.subfeature.ID.e11 object.subfeature.e11 object.subfeature.e14
1 False 66666 2 50
2 False 66666 2 50
object.subfeature.e19
object.features
1 abc object.feature = 3, object.feature = cd,
object.feature = ab..
2 abc object.feature = 3, object.feature = cd,
object.feature = ab..
# Rebuild the two-row example data frame from inline text. Each string is
# wrapped in single quotes so read.table() reads it as one field despite the
# embedded spaces and commas. TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned.
data <- read.table(text="
Var_1_ID Var1_string
1 'object.ID = 00001, object.ID.N = 1, object.subfeature.ID = 55555, object.subfeature.e1 = 1, object.subfeature.e2 = False, object.subfeature.ID = 66666,object.subfeature.e1 = 2, object.subfeature.e4 = 50 object.subfeature.e9 = abc, object.feature = 3, object.feature = cd, object.feature = ab..'
2 'object.ID = 00001, object.ID.N = 1, object.subfeature.ID = 55555, object.subfeature.e1 = 1, object.subfeature.e2 = False, object.subfeature.ID = 66666,object.subfeature.e1 = 2, object.subfeature.e4 = 50 object.subfeature.e9 = abc, object.feature = 3, object.feature = cd, object.feature = ab..'
", header = TRUE, stringsAsFactors = FALSE)