拆分数据框内一行内的列字符串元素

时间:2015-02-19 13:15:18

标签: r split dataframe

我有一个像这样的矩阵(1000 x 2830):

        9178    3574    3547
160     B_B     B_B      A_A
301     B_B     A_B      A_B
303     B_B     B_B      A_A
311     A_B     A_B      A_A
312     B_B     A_B      A_A
314     B_B     A_B      A_A

我希望获得以下内容(复制colnames并拆分每列的每个元素):

      9178   9178   3574   3574   3547   3547
160     B      B      B      B      A      A
301     B      B      A      B      A      B
303     B      B      B      B      A      A
311     A      B      A      B      A      A
312     B      B      A      B      A      A
314     B      B      A      B      A      A

我尝试使用strsplit,但我收到错误消息,因为这是一个矩阵,而不是字符串。你能否提出一些解决这个问题的想法?

4 个答案:

答案 0 :(得分:7)

以下是使用dplyr(用于bind_cols)和tidyr(用于separate_)以及基础R中的lapply的一种方案。它假设您的数据是data.frame(也就是说,如果您的数据是矩阵,可能需要先将其转换为data.frame):

library(dplyr)
library(tidyr)

# For every column name, split that one-column data frame on "_" into two
# columns named <name>_1 and <name>_2, then bind all pieces side by side.
bind_cols(
  lapply(names(df), function(col_name) {
    separate_(df[col_name], col_name, paste0(col_name, "_", 1:2), sep = "_")
  })
)
#  X9178_1 X9178_2 X3574_1 X3574_2 X3547_1 X3547_2
#1       B       B       B       B       A       A
#2       B       B       A       B       A       B
#3       B       B       B       B       A       A
#4       A       B       A       B       A       A
#5       B       B       A       B       A       A
#6       B       B       A       B       A       A

答案 1 :(得分:6)

我有偏见,但我建议在我的“splitstackshape”软件包中使用cSplit。由于您的输入中似乎有rownames,因此请使用as.data.table(., keep.rownames = TRUE)

library(splitstackshape)
# Convert to a data.table (row names are kept as the `rn` column, see the
# output below), then split every original column on "_".
cSplit(
  indt = as.data.table(mydf, keep.rownames = TRUE),
  splitCols = names(mydf),
  sep = "_"
)
#     rn X9178_1 X9178_2 X3574_1 X3574_2 X3547_1 X3547_2
# 1: 160       B       B       B       B       A       A
# 2: 301       B       B       A       B       A       B
# 3: 303       B       B       B       B       A       A
# 4: 311       A       B       A       B       A       A
# 5: 312       B       B       A       B       A       A
# 6: 314       B       B       A       B       A       A

可读性不如cSplit(但目前可能更快)的另一种做法是使用“stringi”中的stri_split_fixed,如下所示:

library(stringi)
# Split each column on "_" into a two-column character matrix, bind those
# matrices side by side, and attach the original row names plus each column
# name repeated twice.
structure(
  do.call(cbind, lapply(mydf, stri_split_fixed, "_", simplify = TRUE)),
  dimnames = list(rownames(mydf), rep(colnames(mydf), each = 2))
)
#     X9178 X9178 X3574 X3574 X3547 X3547
# 160 "B"   "B"   "B"   "B"   "A"   "A"  
# 301 "B"   "B"   "A"   "B"   "A"   "B"  
# 303 "B"   "B"   "B"   "B"   "A"   "A"  
# 311 "A"   "B"   "A"   "B"   "A"   "A"  
# 312 "B"   "B"   "A"   "B"   "A"   "A"  
# 314 "B"   "B"   "A"   "B"   "A"   "A" 

如果速度至关重要,我建议您查看"iotools" package,尤其是mstrsplit功能。该方法类似于“stringi”方法:

library(iotools)
# Same shape as the stringi version: split each column with mstrsplit() into
# two character columns, bind the pieces, then restore row names and
# duplicate each column name.
structure(
  do.call(cbind, lapply(mydf, mstrsplit, "_", ncol = 2, type = "character")),
  dimnames = list(rownames(mydf), rep(colnames(mydf), each = 2))
)

如果您在从matrix转换为data.frame时忘记使用stringsAsFactors = FALSE,则可能需要在其中添加lapply(mydf, as.character),但即便如此,它仍然比stri_split方法更快。

答案 2 :(得分:4)

你可以这样做,虽然它看起来有点“扭曲”(yourmat是你的矩阵)……:

# Split every cell of `yourmat` (taken column by column) on "_". Each cell
# becomes one row of `inter`, with its two halves in columns X1 and X2.
# vapply() enforces that every cell yields exactly two pieces.
# Row names are <column name><row index>, e.g. "91781".
inter <- data.frame(
  t(vapply(as.vector(yourmat),
           function(x) strsplit(x, "_")[[1]],
           character(2),
           USE.NAMES = FALSE)),
  row.names = paste0(rep(colnames(yourmat), each = nrow(yourmat)),
                     seq_len(nrow(yourmat))),
  stringsAsFactors = FALSE
)

# Group the rows by the column they came from and bind the groups side by
# side. The grouping factor is built directly from the column names rather
# than from the first four characters of the row names, so column names of
# any length work. `levels` keeps the original column order.
res <- do.call(
  "cbind",
  split(inter,
        factor(rep(colnames(yourmat), each = nrow(yourmat)),
               levels = colnames(yourmat)))
)
res
#       9178.X1 9178.X2 3574.X1 3574.X2 3547.X1 3547.X2
# 91781       B       B       B       B       A       A
# 91782       B       B       A       B       A       B
# 91783       B       B       B       B       A       A
# 91784       A       B       A       B       A       A
# 91785       B       B       A       B       A       A
# 91786       B       B       A       B       A       A

修改
如果您希望res的row.names与yourmat中的相同,则可以执行以下操作:

row.names(res)<-row.names(yourmat)

注意: 如果yourmat是data.frame而不是matrix,则需要将as.vector那一行改为使用unlist。

答案 3 :(得分:2)

不使用数据框的基础R解决方案:

# split: strsplit() walks the matrix column by column, so after unlist() the
# odd positions hold the first halves and the even positions the second
# halves (assumes every cell contains exactly one "_")
z <- unlist(strsplit(m, "_"))
M <- matrix(c(z[c(TRUE, FALSE)], z[c(FALSE, TRUE)]), nrow = nrow(m))

# properly order columns: M currently holds all first halves followed by all
# second halves; interleave them so each original column's two halves are
# adjacent. drop = FALSE keeps M a matrix when `m` has a single row.
i <- seq_len(ncol(M))
M <- M[, order(c(i[c(TRUE, FALSE)], i[c(FALSE, TRUE)])), drop = FALSE]

# set dimnames
rownames(M) <- rownames(m)
colnames(M) <- rep(colnames(m), each = 2)

#     9178  9178  3574  3574  3547  3547
# 160 "B"   "B"   "B"   "B"   "A"   "A"  
# 301 "B"   "B"   "A"   "B"   "A"   "B"  
# 303 "B"   "B"   "B"   "B"   "A"   "A"  
# 311 "A"   "B"   "A"   "B"   "A"   "A"  
# 312 "B"   "B"   "A"   "B"   "A"   "A"  
# 314 "B"   "B"   "A"   "B"   "A"   "A"  

[更新] 这是对上述各解决方案的一个小型基准测试(我没有包括cSplit解决方案,因为它太慢了):

设定:

# Benchmark fixture: a 1000 x 2830 character matrix whose every cell is
# "A_B", plus a data.frame copy whose columns stay character.
m <- matrix(rep_len("A_B", 1000 * 2830), nrow = 1000, ncol = 2830)
d <- as.data.frame(m, stringsAsFactors = FALSE)

##### 
# Split every "x_y" cell of a character matrix into two adjacent columns.
#
# m: character matrix; every cell is assumed to contain exactly one "_".
# Returns a character matrix with nrow(m) rows and 2 * ncol(m) columns. Row
# names are copied from `m` and each column name of `m` appears twice.
f.mtrx <- function(m) {
  # strsplit() walks the matrix column by column, so after unlist() the odd
  # positions hold the first halves and the even positions the second halves.
  z <- unlist(strsplit(m, "_"))
  M <- matrix(c(z[c(TRUE, FALSE)], z[c(FALSE, TRUE)]), nrow = nrow(m))

  # properly order columns: M holds all first halves followed by all second
  # halves; interleave them so both halves of a source column are adjacent.
  # drop = FALSE keeps M a matrix when `m` has a single row.
  i <- seq_len(ncol(M))
  M <- M[, order(c(i[c(TRUE, FALSE)], i[c(FALSE, TRUE)])), drop = FALSE]

  # set dimnames
  rownames(M) <- rownames(m)
  colnames(M) <- rep(colnames(m), each = 2)
  M
}

library(stringi)
# Same as f.mtrx, but the splitting is done with stringi's stri_split_fixed().
#
# m: character matrix; every cell is assumed to contain exactly one "_".
# Returns a character matrix with nrow(m) rows and 2 * ncol(m) columns. Row
# names are copied from `m` and each column name of `m` appears twice.
f.mtrx2 <- function(m) {
  # After unlist() the odd positions hold the first halves of the cells
  # (in column-major order) and the even positions the second halves.
  z <- unlist(stri_split_fixed(m, "_"))
  M <- matrix(c(z[c(TRUE, FALSE)], z[c(FALSE, TRUE)]), nrow = nrow(m))

  # properly order columns: interleave first and second halves so both
  # halves of a source column are adjacent. drop = FALSE keeps M a matrix
  # when `m` has a single row.
  i <- seq_len(ncol(M))
  M <- M[, order(c(i[c(TRUE, FALSE)], i[c(FALSE, TRUE)])), drop = FALSE]

  # set dimnames
  rownames(M) <- rownames(m)
  colnames(M) <- rep(colnames(m), each = 2)
  M
}

#####
library(splitstackshape)
# Benchmark wrapper for the splitstackshape approach: convert `mydf` to a
# data.table (keeping row names), then split every original column on "_".
f.cSplit <- function(mydf) {
  with_rownames <- as.data.table(mydf, keep.rownames = TRUE)
  cSplit(with_rownames, splitCols = names(mydf), sep = "_")
}

#####
library(stringi)
# Benchmark wrapper for the stringi approach: split each column of `mydf` on
# "_" into a two-column matrix, bind the matrices side by side, and label
# the result with the original row names and each column name repeated twice.
f.stringi <- function(mydf) {
  halves <- lapply(mydf, stri_split_fixed, "_", simplify = TRUE)
  out <- do.call(cbind, halves)
  dimnames(out) <- list(rownames(mydf), rep(colnames(mydf), each = 2))
  out
}

#####
library(dplyr)
library(tidyr)

# Benchmark wrapper for the dplyr/tidyr approach: split each column of `df`
# on "_" into <name>_1 and <name>_2, then bind the pieces column-wise.
f.dplyr <- function(df) {
  split_cols <- lapply(names(df), function(col_name) {
    separate_(df[col_name], col_name, paste0(col_name, "_", 1:2), sep = "_")
  })
  bind_cols(split_cols)
}

#####
library(iotools)
# Benchmark wrapper for the iotools approach: split each column of `mydf`
# with mstrsplit() into two character columns, bind the pieces side by side,
# and label the result with the original row names and each column name
# repeated twice.
f.mstrsplit <- function(mydf) {
  halves <- lapply(mydf, mstrsplit, "_", ncol = 2, type = "character")
  out <- do.call(cbind, halves)
  dimnames(out) <- list(rownames(mydf), rep(colnames(mydf), each = 2))
  out
}



#####
library(rbenchmark)

# Time each approach over 10 replications: the matrix-based functions get the
# matrix `m`, the data-frame-based ones get `d`. f.cSplit is not included
# (the author found it too slow).
benchmark(f.mtrx(m), f.mtrx2(m), f.dplyr(d), f.stringi(d), f.mstrsplit(d), replications = 10)

结果:

      test replications elapsed relative user.self sys.self user.child sys.child
3     f.dplyr(d)           10  27.722   10.162    27.360    0.269          0         0
5 f.mstrsplit(d)           10   2.728    1.000     2.607    0.098          0         0
1      f.mtrx(m)           10  37.943   13.909    34.885    0.799          0         0
2     f.mtrx2(m)           10  15.176    5.563    13.936    0.802          0         0
4   f.stringi(d)           10   8.107    2.972     7.815    0.247          0         0

在更新的基准测试中,获胜者为f.mstrsplit