我有一个名为 mymat 的矩阵。我想创建另一个矩阵,列出 mymat 中所有样本的成对组合(包括样本与自身的组合),并把每对样本对应的 value 相加,得到类似下面 result 的内容。
# Example input: a character matrix with one row per sample.
# The numeric values are stored as strings because a matrix holds one type.
mymat <- cbind(
  samples = c(
    "AOGC-03-0122", "AOGC-05-0009", "AOGC-08-0006", "AOGC-08-0032",
    "AOGC-08-0054"
  ),
  value = c(
    "0.000971685122254438", "0.00114138129544444",
    "0.000779586347096811", "0.00132807674454652", "0.000867219894408284"
  )
)
结果
combination total.value
AOGC-03-0122+AOGC-03-0122 0.00194337
AOGC-03-0122+AOGC-05-0009 0.002113066
.
.
.
AOGC-08-0054+AOGC-08-0054 0.00173444
答案 0 :(得分:2)
矩阵是同质(homogeneous)的数据对象:它本质上是一个类为 matrix、带有维度属性的原子向量(此处忽略列表矩阵的情况)。单个矩阵中不能混合存放字符串和数字。如果要存储列类型各不相同的数据表,应当使用 data.frame。显然,samples 和 value 两列的合适类型分别是字符串和数字。因此,您的输入应该是 data.frame,输出也应该是 data.frame,因为输出只是对输入记录的排列组合。
这里不需要调用 merge(),更不需要调用两次;矢量化索引就可以完成这项工作。使用 merge() 会使结果的排列顺序取决于 samples 值的字典序,而不是它们在输入中出现的顺序,这可能并不是您想要的。
# Parse the value column to numbers once, up front.
values <- as.double(mymat[, "value"])
# Enumerate every ordered pair of row indices; `right` varies fastest, so the
# rows come out grouped by the left-hand sample in input order.
with(
  expand.grid(
    right = seq_len(nrow(mymat)),
    left = seq_len(nrow(mymat))
  ),
  data.frame(
    combination = paste(
      mymat[left, "samples"], mymat[right, "samples"],
      sep = "+"
    ),
    total.value = values[left] + values[right]
  )
)
## combination total.value
## 1 AOGC-03-0122+AOGC-03-0122 0.001943370
## 2 AOGC-03-0122+AOGC-05-0009 0.002113066
## 3 AOGC-03-0122+AOGC-08-0006 0.001751271
## 4 AOGC-03-0122+AOGC-08-0032 0.002299762
## 5 AOGC-03-0122+AOGC-08-0054 0.001838905
## 6 AOGC-05-0009+AOGC-03-0122 0.002113066
## 7 AOGC-05-0009+AOGC-05-0009 0.002282763
## 8 AOGC-05-0009+AOGC-08-0006 0.001920968
## 9 AOGC-05-0009+AOGC-08-0032 0.002469458
## 10 AOGC-05-0009+AOGC-08-0054 0.002008601
## 11 AOGC-08-0006+AOGC-03-0122 0.001751271
## 12 AOGC-08-0006+AOGC-05-0009 0.001920968
## 13 AOGC-08-0006+AOGC-08-0006 0.001559173
## 14 AOGC-08-0006+AOGC-08-0032 0.002107663
## 15 AOGC-08-0006+AOGC-08-0054 0.001646806
## 16 AOGC-08-0032+AOGC-03-0122 0.002299762
## 17 AOGC-08-0032+AOGC-05-0009 0.002469458
## 18 AOGC-08-0032+AOGC-08-0006 0.002107663
## 19 AOGC-08-0032+AOGC-08-0032 0.002656153
## 20 AOGC-08-0032+AOGC-08-0054 0.002195297
## 21 AOGC-08-0054+AOGC-03-0122 0.001838905
## 22 AOGC-08-0054+AOGC-05-0009 0.002008601
## 23 AOGC-08-0054+AOGC-08-0006 0.001646806
## 24 AOGC-08-0054+AOGC-08-0032 0.002195297
## 25 AOGC-08-0054+AOGC-08-0054 0.001734440
#' Pairwise sample combinations and summed values via vectorized indexing.
#'
#' @param mymat Matrix with columns named "samples" and "value"; the values
#'   are converted with as.double().
#' @return A data.frame with one row per ordered pair of input rows:
#'   `combination` ("<left>+<right>") and `total.value` (sum of both values).
#'   Rows follow input order, with the right-hand sample varying fastest.
bgoldst <- function(mymat) {
  values <- as.double(mymat[, "value"])
  labels <- mymat[, "samples"]
  # Var1 varies fastest, so it indexes the right-hand member of each pair.
  idx <- expand.grid(rep(list(seq_len(nrow(mymat))), 2L))
  data.frame(
    combination = paste(labels[idx$Var2], labels[idx$Var1], sep = "+"),
    total.value = values[idx$Var2] + values[idx$Var1]
  )
}
#' Pairwise sample combinations and summed values via expand.grid() + merge().
#'
#' @param mymat Matrix whose first column is named "samples" (sample names)
#'   and whose second column holds the values, converted with as.numeric().
#' @return A data.frame with columns `combination` ("<name>+<name>") and
#'   `total.value` (sum of the two values). The result is returned visibly;
#'   the previous version ended in an assignment and so returned invisibly.
akrun <- function(mymat) {
  # All ordered pairs of sample names, in columns Var1 and Var2.
  d1 <- expand.grid(rep(list(mymat[, "samples"]), 2))
  # Lookup table from sample name to numeric value.
  d2 <- data.frame(
    samples = mymat[, 1],
    value = as.numeric(mymat[, 2]),
    stringsAsFactors = FALSE
  )
  # Attach the value for Var1, then the value for Var2. After the outer merge
  # the columns are Var2, Var1, value.x, value.y.
  # NOTE: merge() joins on the sample name, so rows are ordered by merge()'s
  # key sort rather than by input order, and duplicated sample names would
  # produce extra rows.
  d3 <- merge(
    merge(d1, d2, by.x = "Var1", by.y = "samples", all.x = TRUE),
    d2,
    by.x = "Var2", by.y = "samples"
  )
  data.frame(
    combination = do.call(paste, c(d3[1:2], sep = "+")),
    total.value = d3[, 3] + d3[, 4]
  )
}
# Check that both implementations agree on the small example, then time them.
identical(
  bgoldst(mymat),
  akrun(mymat)
)
## [1] TRUE
library(microbenchmark)
microbenchmark(
  bgoldst(mymat),
  akrun(mymat)
)
## Unit: microseconds
## expr min lq mean median uq max neval
## bgoldst(mymat) 390.875 412.685 444.4554 433.8535 457.589 662.434 100
## akrun(mymat) 1603.697 1658.009 1789.0585 1692.0075 1824.793 3227.921 100
# Build a larger input: N samples with random values stored as strings.
N <- 1e3
mymat <- matrix(
  c(sprintf("sample_%d", seq_len(N)), runif(N)),
  ncol = 2L,
  dimnames = list(NULL, c("samples", "value"))
)
x <- bgoldst(mymat)
y <- akrun(mymat)
# Compare the two results after sorting rows by combination, converting
# combination to character, and resetting row names, so that row order,
# factor level order, and row names cannot make the check fail.
do.call(identical, lapply(list(x, y), function(d) {
  structure(
    transform(
      d[order(d$combination), ],
      combination = as.character(combination)
    ),
    row.names = seq_len(nrow(d))
  )
}))
## [1] TRUE
microbenchmark(
  bgoldst(mymat),
  akrun(mymat),
  times = 3L
)
## Unit: seconds
## expr min lq mean median uq max neval
## bgoldst(mymat) 8.103589 8.328722 8.418285 8.553854 8.575633 8.597411 3
## akrun(mymat) 30.777301 31.152458 31.348615 31.527615 31.634272 31.740929 3
答案 1 :(得分:1)
我们可以将 expand.grid 与 merge 结合使用:
# All ordered pairs of sample names, in columns Var1 and Var2.
d1 <- expand.grid(list(mymat[, "samples"], mymat[, "samples"]))
# Lookup table from sample name to numeric value.
d2 <- data.frame(
  samples = mymat[, 1],
  value = as.numeric(mymat[, 2]),
  stringsAsFactors = FALSE
)
# Attach the value for Var1 first, then the value for Var2; the columns end up
# as Var2, Var1, value.x, value.y.
d3 <- merge(d1, d2, by.x = "Var1", by.y = "samples", all.x = TRUE)
d3 <- merge(d3, d2, by.x = "Var2", by.y = "samples")
res <- data.frame(
  combination = paste(d3[[1]], d3[[2]], sep = "+"),
  total.value = d3[[3]] + d3[[4]]
)
head(res, 3)
# combination total.value
#1 AOGC-03-0122+AOGC-03-0122 0.001943370
#2 AOGC-03-0122+AOGC-05-0009 0.002113066
#3 AOGC-03-0122+AOGC-08-0006 0.001751271