R:将字符串元素提取到新列

时间:2018-08-03 11:14:30

标签: r string dplyr substring mutate

我在R中有一个具有1000行的数据框,变量var1_string是一个看起来像这样的字符串:

var_1_ID   var1_string
1          "object.ID = 00001, object.ID.N = 1, object.subfeature.ID = 55555, object.subfeature.e1 = 1, object.subfeature.e2 = False, object.subfeature.ID = 66666,object.subfeature.e1 = 2, object.subfeature.e4 = 50 object.subfeature.e9 = abc, object.feature = 3, object.feature = cd, object.feature = ab.."
2          "object.ID = 00001, object.ID.N = 1, object.subfeature.ID = 55555, object.subfeature.e1 = 1, object.subfeature.e2 = False, object.subfeature.ID = 66666,object.subfeature.e1 = 2, object.subfeature.e4 = 50 object.subfeature.e9 = abc, object.feature = 3, object.feature = cd, object.feature = ab.."

and so on for n rows...

有没有一种快速的方法可以将以下内容提取到新列中:

new_column_1 = "object.ID = 00001, object.ID.N = 1"

new_column_2 到 new_column_n 各包含一个子功能块:从 "object.subfeature.ID" 开始,到下一个 "object.subfeature.ID" 之前为止,依此类推,直到没有更多子功能块。

因此在此示例中:

new_column_2 = object.subfeature.ID = 55555, object.subfeature.e1 = 1, object.subfeature.e2 = False

然后,下一列包含下一个object.subfeature.ID块,例如object.subfeature.ID = 66666,object.subfeature.e1 = 2, object.subfeature.e4 = 50

最后,所有 object.feature 条目放在单独的一列中。

例如所需的输出:

Var_1_ID   Var1_string      New_col1                             New_col2                                                                         New_col3
1          String as above  object.ID = 00001, object.ID.N = 1   object.subfeature.ID = 55555,  object.subfeature.e1 = 1, object.subfeature.e2 = False                    object.subfeature.ID = 66666,object.subfeature.e1 = 2, object.subfeature.e4 = 50 
                            object.subfeature.e9 = abc, object.feature = 3, object.feature = cd, object.feature = ab..
2          String as above  object.ID = 00001, object.ID.N = 1

有使用子串的方法吗?还是您必须使用stringr::str_extract / dplyr::mutate(strsplit...)?

注意:object.subfeature 块(例如 `object.subfeature.ID = 55555` 及其后续字段)的数量不固定,可以是 0 到 n 个。

2 个答案:

答案 0 :(得分:0)

我认为这可能是一个好的开始

# Load the tidyverse packages used below (tibble, stringr, tidyr, dplyr).
library(tidyverse)

# Both example rows carry the same raw string, so define it once.
example_string <- "object.ID = 00001, object.ID.N = 1, object.subfeature.ID = 55555, object.subfeature.e1 = 1, object.subfeature.e2 = False, object.subfeature.ID = 66666,object.subfeature.e1 = 2, object.subfeature.e4 = 50 object.subfeature.e9 = abc, object.feature = 3, object.feature = cd"

# tibble() replaces the deprecated data_frame() constructor.
df <- tibble(Var_1_ID = c(1, 2), Var1_string = rep(example_string, 2))

# Reshape to one column per "key = value" piece of each row's string.
df2 <- df %>%
    # Split every string on commas; newstring becomes a list-column of pieces.
    mutate(newstring = str_split(Var1_string, ",")) %>%
    # Name the list-column explicitly; a bare unnest() warns in current tidyr.
    unnest(newstring) %>%
    # Split each piece at the first "=". extra = "merge" keeps the remainder of
    # a piece that contains a second "=" (the sample has one, where a comma is
    # missing before object.subfeature.e9) instead of discarding it.
    # The spaces around "=" are not trimmed, so names and values stay padded.
    separate(newstring, into = c("col", "val"), sep = "=", extra = "merge") %>%
    select(-Var1_string) %>%
    group_by(Var_1_ID) %>%
    # Number the pieces within each ID so that repeated keys stay distinct.
    mutate(key = row_number()) %>%
    # The grouping was only needed for the numbering above.
    ungroup() %>%
    unite("new_var", key, col, sep = "_") %>%
    # One column per numbered key, one row per Var_1_ID.
    spread(new_var, val)

df2
  Var_1_ID `1_object.ID ` `10_ object.featu… `2_ object.ID.N… `3_ object.subfea… 
1        1 " 00001"       " cd"              " 1"             " 55555"           " 1"           
2        2 " 00001"       " cd"              " 1"             " 55555"           " 1"        

请注意,我必须在每个变量名前加上一个序号,使列名唯一,这样才能把数据展开成宽格式。

答案 1 :(得分:0)

# Insert ";" at every intended column boundary of Var1_string, split on it,
# name the resulting columns and strip the "key = " prefixes.
# Expects a data frame `data` with a Var1_string column to exist already.

# dplyr supplies mutate()/across(); tidyr supplies separate().
library(dplyr)
library(tidyr)

# Prepare data to be split on `;`
# 1. Mark the end of the object.ID / object.ID.N header. Note that [0-9]
#    matches a single-digit N only.
data$Var1_string <- gsub("(object.ID.N = [0-9]),", "\\1;", data$Var1_string)
# 2. Replace the character after each object.subfeature.eN value with ";".
#    [^a-z] consumes exactly one character: the comma, or the space where the
#    sample is missing a comma before object.subfeature.e9.
data$Var1_string <- gsub("(object.subfeature.e[0-9] = [0-9a-zA-Z]+)[^a-z]", "\\1;", data$Var1_string)
# 3. Same for the character after each object.subfeature.ID value.
data$Var1_string <- gsub("(object.subfeature.ID = [0-9a-zA-Z]+)[^a-z]", "\\1;", data$Var1_string)

# Widest row decides how many columns separate() must create.
# (Named n_pieces rather than ncol to avoid masking base::ncol.)
n_pieces <- max(lengths(strsplit(data$Var1_string, split = ";")))

# remove = FALSE keeps the prepared Var1_string next to the new columns.
data <- data %>%
  separate(Var1_string, into = paste0("Col", seq_len(n_pieces)), sep = ";", remove = FALSE)

# Hard-coded names for the nine pieces the sample data produces (columns 3-11,
# after Var_1_ID and Var1_string). The numeric suffixes only make repeated
# keys unique.
colnames(data)[3:11] <- c('New_col','object.subfeature.ID.e1','object.subfeature.e1','object.subfeature.e2','object.subfeature.ID.e11','object.subfeature.e11','object.subfeature.e14','object.subfeature.e19','object.features')

# Drop the "key = " prefix from every subfeature column, leaving the value.
# across() with a lambda replaces the deprecated mutate_at(..., funs(...)).
# The result is printed, not assigned.
data %>%
  mutate(across(
    contains("object.subfeature"),
    ~ gsub("object.subfeature.e[0-9] = |object.subfeature.ID = ", "", .x)
  ))

  Var_1_ID
1        1
2        2

Var1_string
1 object.ID = 00001, object.ID.N = 1; object.subfeature.ID = 55555; object.subfeature.e1 = 1; object.subfeature.e2 = False; object.subfeature.ID = 66666;object.subfeature.e1 = 2; object.subfeature.e4 = 50;object.subfeature.e9 = abc; object.feature = 3, object.feature = cd, object.feature = ab..
2 object.ID = 00001, object.ID.N = 1; object.subfeature.ID = 55555; object.subfeature.e1 = 1; object.subfeature.e2 = False; object.subfeature.ID = 66666;object.subfeature.e1 = 2; object.subfeature.e4 = 50;object.subfeature.e9 = abc; object.feature = 3, object.feature = cd, object.feature = ab..
                         New_col object.subfeature.ID.e1 object.subfeature.e1
1 object.ID = 00001, object.ID.N = 1                   55555                    1
2 object.ID = 00001, object.ID.N = 1                   55555                    1
object.subfeature.e2 object.subfeature.ID.e11 object.subfeature.e11 object.subfeature.e14
 1                False                    66666                     2                    50
 2                False                    66666                     2                    50
 object.subfeature.e19                                                 
 object.features
 1                   abc  object.feature = 3, object.feature = cd, 
 object.feature = ab..
 2                   abc  object.feature = 3, object.feature = cd, 
 object.feature = ab..

数据

 # Example input: two rows sharing the same raw string. The single quotes keep
 # each string, with its embedded commas and spaces, in one field.
 # TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
 data <- read.table(text = "
         Var_1_ID   Var1_string
         1          'object.ID = 00001, object.ID.N = 1, object.subfeature.ID = 55555, object.subfeature.e1 = 1, object.subfeature.e2 = False, object.subfeature.ID = 66666,object.subfeature.e1 = 2, object.subfeature.e4 = 50 object.subfeature.e9 = abc, object.feature = 3, object.feature = cd, object.feature = ab..'
         2          'object.ID = 00001, object.ID.N = 1, object.subfeature.ID = 55555, object.subfeature.e1 = 1, object.subfeature.e2 = False, object.subfeature.ID = 66666,object.subfeature.e1 = 2, object.subfeature.e4 = 50 object.subfeature.e9 = abc, object.feature = 3, object.feature = cd, object.feature = ab..'
               ", header = TRUE, stringsAsFactors = FALSE)