删除重复的字符串后,合并data.frame中的列

时间:2014-12-19 05:48:43

标签: r string dataframe data.table strsplit

我有一个由字符向量组成的 data.frame `data`,如下所示。

# Sample input: every cell holds a comma-separated list of names.
x <- c(
  "kal, Kon, Jor, Kara",
  "Bruce, Helena, Martha, Terry",
  "connor, oliver, Roy",
  "Alan, Guy, Simon, Kyle"
)
y <- c(
  "Mon, Cir, John, Jor",
  "Damian, Terry, Jason",
  "Mia, Roy",
  "John, Cary"
)
# Keep both columns as plain character vectors rather than factors.
data <- data.frame(x = x, y = y, stringsAsFactors = FALSE)

我正在尝试将 x 和 y 两列中的字符串连接成新列 z。在连接字符串之前,我想删除重复项,并对以逗号分隔的单词进行排序。我可以通过如下方式实现这一目标。

# Split each comma-separated cell into a character vector of words.
# fixed = TRUE treats ", " as a literal separator instead of a regex.
x <- strsplit(data$x, split = ", ", fixed = TRUE)
y <- strsplit(data$y, split = ", ", fixed = TRUE)
# For every row: merge both word sets (union() drops duplicates), sort the
# result and join it back with ", ".
# seq_along() yields an empty sequence for zero-row input, whereas
# 1:length(x) would expand to c(1, 0) and index out of bounds; vapply()
# always returns a character vector, unlike sapply().
data$z <- vapply(
  seq_along(x),
  function(i) paste(sort(union(x[[i]], y[[i]])), collapse = ", "),
  character(1)
)

有没有更快的方法,无需创建中间列表,比如使用 data.table?

2 个答案:

答案 0 :(得分:5)

您可以尝试regex解决方案。但是,这不会像你想要的那样排序。

# Join both columns into one comma-separated string per row.
v1 <- paste(data[,1], data[,2], sep=", ")
# Remove every word that occurs again later in the same string, so only the
# last occurrence of each word survives (de-duplicated, but not sorted).
# gsub() is needed here: sub() replaces only the first match and would leave
# further duplicates in place when a row repeats more than one word.
# The trailing \\s* also consumes the blank after the removed comma, so no
# double space is left behind.
data$z <- gsub('(\\b\\S+\\b)(?=.*\\b\\1\\b.*),\\s*', "", v1, perl=TRUE)

可以在 regex101 上查看该正则表达式。

其他选项包括

library(splitstackshape)
library(data.table)
# Build column z by splitting x and y into long form with cSplit(), then
# collapsing the unique, non-NA values of each original row into one string
# via toString(). No sort() is applied, so z keeps first-seen order.
# A temporary row index `indx` is added to group by and is removed from the
# aggregated result with indx:=NULL before cbind() attaches z to columns 1:2.
# NOTE(review): `with=FALSE` is a data.table argument, so the first cbind()
# argument presumably requires `data` to already be a data.table -- confirm.
# NOTE(review): cSplit() is called with sep="," while the cells use ", ";
# presumably surrounding whitespace is trimmed -- verify against the
# splitstackshape documentation.
cbind(data[,1:2, with=FALSE],cSplit(setDT(data)[, indx:=1:.N],
      c('x', 'y'), sep=",", 'long')[ ,
     list(z=toString(unique(na.omit(unlist(.SD))))),
                           by=indx][,indx:=NULL])

                                 x                    y
 #1:          kal, Kon, Jor, Kara  Mon, Cir, John, Jor
 #2: Bruce, Helena, Martha, Terry Damian, Terry, Jason
 #3:          connor, oliver, Roy             Mia, Roy
 #4:       Alan, Guy, Simon, Kyle           John, Cary
  #                                       z
 #1:         kal, Kon, Jor, Kara, Mon, Cir, John
 #2: Bruce, Helena, Martha, Terry, Damian, Jason
 #3:                    connor, oliver, Roy, Mia
 #4:          Alan, Guy, Simon, Kyle, John, Cary

或使用stringi

 library(stringi)
 # Paste x and y per row, pull out every \w+ token, then de-duplicate, sort
 # and re-join the tokens with ", " (what toString() does by default).
 words_per_row <- stri_extract_all_regex(paste(data$x, data$y), '\\w+')
 data$z <- vapply(
   words_per_row,
   function(words) paste(sort(unique(words)), collapse = ", "),
   character(1)
 )

基准

基于一个不太大的数据集:

 # Repeat the sample rows 3e4 times to get a benchmark-sized data set.
 # seq_len() keeps the row index valid even for zero-row input, where
 # 1:nrow(data) would expand to c(1, 0).
 data <- data[rep(seq_len(nrow(data)), 3e4), ]
 # Discard the row names produced by the repeated row indexing.
 row.names(data) <- NULL

 cath <- function(){
   # Benchmark variant in base R: for each row of the global `data`, join the
   # first two cells, split on ", ", then de-duplicate, sort and re-join.
   merge_row <- function(row) {
     combined <- paste(row[1], row[2], sep=", ")
     words <- strsplit(combined, ", ")[[1]]
     paste(sort(unique(words)), collapse=", ")
   }
   apply(data, 1, merge_row)
 }

 akrun2 <- function(){
   # Benchmark variant using stringi: extract every \w+ token from the pasted
   # x/y strings of the global `data`, then de-duplicate, sort and re-join
   # the tokens of each row.
   tokens_by_row <- stri_extract_all_regex(paste(data$x, data$y), '\\w+')
   collapse_row <- function(tokens) toString(sort(unique(tokens)))
   vapply(tokens_by_row, collapse_row, character(1))
 }

 akrun3 <- function(){
    # Benchmark variant using a single regex substitution on the first two
    # columns of the global `data`. The pattern matches a word that occurs
    # again later in the string, together with the comma that follows it.
    # sub() replaces only the first match per string, so at most one
    # duplicate is dropped per row and the result is not sorted.
    joined <- paste(data[,1], data[,2], sep=", ")
    dup_word_pattern <- '(\\b\\S+\\b)(?=.*\\b\\1\\b.*),'
    sub(dup_word_pattern, "", joined, perl=TRUE)
 }

 # Time the three approaches over 10 runs each. unit='relative' is requested;
 # the recorded output below shows akrun3() as the 1.0 baseline.
 microbenchmark(cath(), akrun2(), akrun3(),unit='relative', times=10L)
 #Unit: relative
 #   expr       min        lq      mean   median       uq      max neval cld
 # cath() 11.700071 11.979908 11.700118 11.76762 11.57583 11.40806    10   c
 #akrun2()  7.175622  7.225212  7.217322  7.19431  7.09539  7.31929    10  b 
 #akrun3()  1.000000  1.000000  1.000000  1.00000  1.00000  1.00000    10  a  

答案 1 :(得分:3)

沿着您的思路更进一步,您可以这样做,而无需创建中间列表:

# Row by row: join the first two cells, split on ", ", drop duplicates and
# re-join. No sort() is applied, so words keep their first-seen order.
data$z <- apply(data, 1, function(row) {
  combined <- paste(row[1], row[2], sep=", ")
  paste(unique(strsplit(combined, ", ")[[1]]), collapse=", ")
})

> data
                             x                    y                                           z
1          kal, Kon, Jor, Kara  Mon, Cir, John, Jor         kal, Kon, Jor, Kara, Mon, Cir, John
2 Bruce, Helena, Martha, Terry Damian, Terry, Jason Bruce, Helena, Martha, Terry, Damian, Jason
3          connor, oliver, Roy             Mia, Roy                    connor, oliver, Roy, Mia
4       Alan, Guy, Simon, Kyle           John, Cary          Alan, Guy, Simon, Kyle, John, Cary

虽然速度较慢,但基础R并没有那么糟糕,基于@akrun的3e4行数据集:

>  microbenchmark(cath(), akrun2(), unit='relative', times=100L)
Unit: relative
     expr      min       lq     mean   median       uq      max neval cld
   cath() 1.429732 1.425991 1.427143 1.427015 1.435986 1.360235   100   b
 akrun2() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100  a