我有一个像这样的矩阵(1000 x 2830):
9178 3574 3547
160 B_B B_B A_A
301 B_B A_B A_B
303 B_B B_B A_A
311 A_B A_B A_A
312 B_B A_B A_A
314 B_B A_B A_A
我希望获得以下内容(复制colnames并拆分每列的每个元素):
9178 9178 3574 3574 3547 3547
160 B B B B A A
301 B B A B A B
303 B B B B A A
311 A B A B A A
312 B B A B A A
314 B B A B A A
我尝试使用strsplit
,但我收到错误消息,因为这是一个矩阵,而不是字符串。你能否提出一些解决这个问题的想法?
答案 0 :(得分:7)
以下是一个使用dplyr(提供bind_cols)、tidyr(提供拆分列的separate函数族)以及base R的lapply的方案。它假设您的数据是data.frame(也就是说,您可能需要先把矩阵转换为data.frame):
library(dplyr)
library(tidyr)
# Split each column of `df` on "_" into two columns named <col>_1 and <col>_2,
# then bind the per-column results side by side. `separate()` with `all_of()`
# replaces the deprecated standard-evaluation variant `separate_()`.
lapply(names(df), function(x) {
  separate(df[x], col = all_of(x), into = paste0(x, "_", 1:2), sep = "_")
}) %>%
  bind_cols()
# X9178_1 X9178_2 X3574_1 X3574_2 X3547_1 X3547_2
#1 B B B B A A
#2 B B A B A B
#3 B B B B A A
#4 A B A B A A
#5 B B A B A A
#6 B B A B A A
答案 1 :(得分:6)
我有偏见,但我建议在我的“splitstackshape”软件包中使用cSplit
。由于您的输入中似乎有rownames
,因此请使用as.data.table(., keep.rownames = TRUE)
:
library(splitstackshape)
# Convert `mydf` to a data.table (keeping its row names as a column), then
# split every original column on "_". `as.data.table()` is namespace-qualified
# so the call resolves even when data.table itself is not attached.
cSplit(
  data.table::as.data.table(mydf, keep.rownames = TRUE),
  splitCols = names(mydf),
  sep = "_"
)
# rn X9178_1 X9178_2 X3574_1 X3574_2 X3547_1 X3547_2
# 1: 160 B B B B A A
# 2: 301 B B A B A B
# 3: 303 B B B B A A
# 4: 311 A B A B A A
# 5: 312 B B A B A A
# 6: 314 B B A B A A
另一种做法可读性不如cSplit(但目前可能更快),即使用“stringi”中的stri_split_fixed,如下所示:
library(stringi)
# Split every column of `mydf` on "_", bind the per-column pieces column-wise,
# then restore the row names and give each pair of halves its source column
# name. local() keeps the intermediates out of the workspace while still
# printing the result at top level.
local({
  halves <- lapply(mydf, stri_split_fixed, "_", simplify = TRUE)
  wide <- do.call(cbind, halves)
  dimnames(wide) <- list(rownames(mydf), rep(colnames(mydf), each = 2))
  wide
})
# X9178 X9178 X3574 X3574 X3547 X3547
# 160 "B" "B" "B" "B" "A" "A"
# 301 "B" "B" "A" "B" "A" "B"
# 303 "B" "B" "B" "B" "A" "A"
# 311 "A" "B" "A" "B" "A" "A"
# 312 "B" "B" "A" "B" "A" "A"
# 314 "B" "B" "A" "B" "A" "A"
如果速度至关重要,我建议您查看"iotools" package,尤其是mstrsplit
功能。该方法类似于“stringi”方法:
library(iotools)
# Same shape as the stringi approach, but each column is split by
# iotools::mstrsplit() into a two-column character matrix. local() keeps the
# intermediates out of the workspace while still printing the result.
local({
  halves <- lapply(mydf, mstrsplit, "_", ncol = 2, type = "character")
  wide <- do.call(cbind, halves)
  dimnames(wide) <- list(rownames(mydf), rep(colnames(mydf), each = 2))
  wide
})
如果您在从matrix转换为data.frame时忘记使用stringsAsFactors = FALSE,则可能需要在其中添加lapply(mydf, as.character),但即使如此,它仍然比stri_split方法更快。
答案 2 :(得分:4)
你可以这样做,虽然它看起来有点“绕”(yourmat是你的矩阵)……:
# Split every cell of `yourmat` on "_". Each cell becomes one row of `inter`
# (columns X1 and X2), labelled "<column name><row index>". vapply() enforces
# that every cell splits into exactly two parts.
inter <- data.frame(
  t(vapply(as.vector(yourmat), function(x) strsplit(x, "_")[[1]], character(2))),
  row.names = paste0(
    rep(colnames(yourmat), each = nrow(yourmat)),
    seq_len(nrow(yourmat))
  ),
  stringsAsFactors = FALSE
)
# Group the rows by their source column and bind the groups side by side.
# The grouping factor is built directly from the column names rather than from
# the first four characters of the row labels, so column names of any length
# are handled.
res <- do.call(
  "cbind",
  split(
    inter,
    factor(
      rep(colnames(yourmat), each = nrow(yourmat)),
      levels = colnames(yourmat)
    )
  )
)
res
# 9178.X1 9178.X2 3574.X1 3574.X2 3547.X1 3547.X2
# 91781 B B B B A A
# 91782 B B A B A B
# 91783 B B B B A A
# 91784 A B A B A A
# 91785 B B A B A A
# 91786 B B A B A A
**修改**
如果您希望res的row.names与yourmat中的相同,则可以执行以下操作:
row.names(res)<-row.names(yourmat)
注意:如果yourmat是data.frame而不是matrix,则代码中的as.vector需要改为unlist。
答案 3 :(得分:2)
不使用数据框的基础R解决方案:
# split
# strsplit() walks the matrix column by column, so `z` alternates
# first half, second half, first half, ...
z <- unlist(strsplit(m, "_"))
# The first ncol(m) columns hold the first halves, the rest the second halves.
M <- matrix(c(z[c(TRUE, FALSE)], z[c(FALSE, TRUE)]), nrow = nrow(m))
# properly order columns
# order() of (odd positions, even positions) yields 1, k+1, 2, k+2, ..., which
# interleaves the two groups; drop = FALSE keeps M a matrix for one-row input.
i <- seq_len(ncol(M))
M <- M[, order(c(i[c(TRUE, FALSE)], i[c(FALSE, TRUE)])), drop = FALSE]
# set dimnames
rownames(M) <- rownames(m)
colnames(M) <- rep(colnames(m), each = 2)
M
# 9178 9178 3574 3574 3547 3547
# 160 "B" "B" "B" "B" "A" "A"
# 301 "B" "B" "A" "B" "A" "B"
# 303 "B" "B" "B" "B" "A" "A"
# 311 "A" "B" "A" "B" "A" "A"
# 312 "B" "B" "A" "B" "A" "A"
# 314 "B" "B" "A" "B" "A" "A"
**[更新]**
这是对各个提议方案的一个小型基准测试(我没有包括cSplit方案,因为它太慢了):
**设定:**
# Benchmark input: a 1000 x 2830 character matrix filled with "A_B", plus the
# same data as a data.frame with character (not factor) columns.
m <- array("A_B", dim = c(1000L, 2830L))
d <- as.data.frame(x = m, stringsAsFactors = FALSE)
#####
# Split every "a_b" cell of a character matrix into two adjacent columns,
# using base R only.
#
# m: character matrix. The alternating-index extraction below assumes every
#    cell splits on "_" into exactly two parts.
# Returns a character matrix with nrow(m) rows and 2 * ncol(m) columns; row
# names are copied from `m` and each column name of `m` appears twice.
f.mtrx <- function(m) {
  # strsplit() walks the matrix column by column, so `z` alternates
  # first half, second half, first half, ...
  z <- unlist(strsplit(m, "_"))
  # The first ncol(m) columns hold the first halves, the rest the second halves.
  M <- matrix(c(z[c(TRUE, FALSE)], z[c(FALSE, TRUE)]), nrow = nrow(m))
  # order() of (odd positions, even positions) yields 1, k+1, 2, k+2, ...,
  # which interleaves the two groups. drop = FALSE keeps the result a matrix
  # when `m` has a single row.
  i <- seq_len(ncol(M))
  M <- M[, order(c(i[c(TRUE, FALSE)], i[c(FALSE, TRUE)])), drop = FALSE]
  # set dimnames
  rownames(M) <- rownames(m)
  colnames(M) <- rep(colnames(m), each = 2)
  M
}
library(stringi)
# Same as the base-R matrix approach, but the split itself is done by
# stringi::stri_split_fixed().
#
# m: character matrix. The alternating-index extraction below assumes every
#    cell splits on "_" into exactly two parts.
# Returns a character matrix with nrow(m) rows and 2 * ncol(m) columns; row
# names are copied from `m` and each column name of `m` appears twice.
f.mtrx2 <- function(m) {
  # The split results come back in column-major cell order, so `z` alternates
  # first half, second half, first half, ...
  z <- unlist(stri_split_fixed(m, "_"))
  # The first ncol(m) columns hold the first halves, the rest the second halves.
  M <- matrix(c(z[c(TRUE, FALSE)], z[c(FALSE, TRUE)]), nrow = nrow(m))
  # order() of (odd positions, even positions) yields 1, k+1, 2, k+2, ...,
  # which interleaves the two groups. drop = FALSE keeps the result a matrix
  # when `m` has a single row.
  i <- seq_len(ncol(M))
  M <- M[, order(c(i[c(TRUE, FALSE)], i[c(FALSE, TRUE)])), drop = FALSE]
  # set dimnames
  rownames(M) <- rownames(m)
  colnames(M) <- rep(colnames(m), each = 2)
  M
}
#####
library(splitstackshape)
# Benchmark wrapper around splitstackshape::cSplit().
#
# mydf: data.frame whose columns all hold "a_b" strings.
# Returns whatever cSplit() returns for the converted table, with the row
# names of `mydf` kept as a column. `as.data.table()` is namespace-qualified so
# the call resolves even when data.table itself is not attached.
f.cSplit <- function(mydf) {
  cSplit(
    data.table::as.data.table(mydf, keep.rownames = TRUE),
    splitCols = names(mydf),
    sep = "_"
  )
}
#####
library(stringi)
# Benchmark wrapper for the stringi approach.
#
# mydf: data.frame whose columns all hold "a_b" strings.
# Returns the column-wise binding of the per-column split results, with the
# row names of `mydf` and each column name of `mydf` repeated twice.
f.stringi <- function(mydf) {
  halves <- lapply(mydf, stri_split_fixed, "_", simplify = TRUE)
  wide <- do.call(cbind, halves)
  dimnames(wide) <- list(rownames(mydf), rep(colnames(mydf), each = 2))
  wide
}
#####
library(dplyr)
library(tidyr)
# Benchmark wrapper for the dplyr/tidyr approach.
#
# df: data.frame whose columns all hold "a_b" strings.
# Returns the per-column split results bound side by side; each source column
# <col> becomes <col>_1 and <col>_2. `separate()` with `all_of()` replaces the
# deprecated standard-evaluation variant `separate_()`.
f.dplyr <- function(df) {
  lapply(names(df), function(x) {
    separate(df[x], col = all_of(x), into = paste0(x, "_", 1:2), sep = "_")
  }) %>%
    bind_cols()
}
#####
library(iotools)
# Benchmark wrapper for the iotools approach.
#
# mydf: data.frame whose columns all hold "a_b" strings.
# Returns the column-wise binding of the per-column mstrsplit() results, with
# the row names of `mydf` and each column name of `mydf` repeated twice.
f.mstrsplit <- function(mydf) {
  halves <- lapply(mydf, mstrsplit, "_", ncol = 2, type = "character")
  wide <- do.call(cbind, halves)
  dimnames(wide) <- list(rownames(mydf), rep(colnames(mydf), each = 2))
  wide
}
#####
library(rbenchmark)
# Time each candidate over 10 replications: the matrix-based functions get the
# matrix `m`, the data.frame-based ones get `d`.
benchmark(
  f.mtrx(m),
  f.mtrx2(m),
  f.dplyr(d),
  f.stringi(d),
  f.mstrsplit(d),
  replications = 10
)
**结果:**
test replications elapsed relative user.self sys.self user.child sys.child
3 f.dplyr(d) 10 27.722 10.162 27.360 0.269 0 0
5 f.mstrsplit(d) 10 2.728 1.000 2.607 0.098 0 0
1 f.mtrx(m) 10 37.943 13.909 34.885 0.799 0 0
2 f.mtrx2(m) 10 15.176 5.563 13.936 0.802 0 0
4 f.stringi(d) 10 8.107 2.972 7.815 0.247 0 0
在更新的基准测试中,获胜者为f.mstrsplit
。