Question

我有一个矩阵，如：

# Example data: a 30 x 3 numeric matrix whose columns are normal draws with
# means 0, 2 and 5 (default sd of 1). cbind() already returns a matrix.
m <- cbind(rnorm(30, mean = 0), rnorm(30, mean = 2), rnorm(30, mean = 5))

我想得到一个输出矩阵，其中包含每两行之间的差值。结果可能如下所示：

Answer 1

如果要计算的是每一对行组合之间的差值，可以使用 combn：

# Difference for every unordered pair of rows: combn() passes each index pair
# to pair_diff(), which returns m[i[1], ] - m[i[2], ]; t() puts one pair per row.
pair_diff <- function(i) m[i[1], ] - m[i[2], ]
t(combn(nrow(m), 2, FUN = pair_diff))

或者使用 expand.grid，这样结果中也会包含每一行与自身的差值（全为 0）：

 # Differences for every ordered pair of rows (i, j), including i == j.
 # d1 lists the index pairs; seq_len() (rather than 1:nrow(m)) stays correct
 # when m has no rows.
 d1 <- expand.grid(seq_len(nrow(m)), seq_len(nrow(m)))
 # Label each output row "i;j"; that row holds m[i, ] - m[j, ].
 rn <- do.call(paste, c(d1, sep = ";"))
 # as.matrix() makes the data frame -> matrix coercion explicit. apply() yields
 # one column per pair, or a plain vector when m has a single column; refilling
 # by row with a fixed ncol gives one output row per pair in both cases.
 diffs <- apply(as.matrix(d1), 1, function(i) m[i[1], ] - m[i[2], ])
 res <- matrix(diffs, ncol = ncol(m), byrow = TRUE,
               dimnames = list(rn, colnames(m)))

更新

下面是一种更高效的方法：

 # Vectorised pairwise row differences: build two N^2-row matrices by
 # replicating row indices, then subtract them in a single step.
 # m1 repeats each row of m N times in a run and m2 cycles through all rows
 # N times, so output row (i - 1) * N + j holds m[i, ] - m[j, ].
 # seq_len() stays correct for a zero-row m, and drop = FALSE keeps the result
 # a matrix even when m has a single row or a single column.
 idx <- seq_len(nrow(m))
 m1 <- m[rep(idx, each = nrow(m)), , drop = FALSE]
 m2 <- m[rep(idx, nrow(m)), , drop = FALSE]
 m1 - m2

基准

# Benchmark data: N rows of standard-normal draws in three named columns.
N <- 500
set.seed(0)
m <- matrix(rnorm(3 * N), nrow = N, dimnames = list(NULL, paste0("x", 1:3)))

先看另一个回答中所述的 O(N) 方法的耗时，以作比较：

# Time the lapply-based approach from the other answer.
system.time({
  # Work on the transpose so that each original row is a column.
  tm <- t(m)
  # Block i is tm with column i subtracted from every column.
  per_row <- lapply(seq_len(ncol(tm)), function(i) tm - tm[, i])
  z <- do.call(cbind, per_row)
  # Labels "i;j" with i varying slowest; row "i;j" holds m[j, ] - m[i, ].
  n <- nrow(m)
  row_names <- paste(rep(seq_len(n), each = n), rep(seq_len(n), times = n),
                     sep = ";")
  # Each column of z becomes one row of the result.
  matrix(z, ncol = ncol(m), byrow = TRUE,
         dimnames = list(row_names, colnames(m)))
})
 #   user  system elapsed 
 #  0.25    0.02    0.27

使用新方法

# Time the vectorised approach: replicate row indices and subtract once.
system.time({
  # seq_len() is safe for a zero-row m; drop = FALSE keeps the matrix shape
  # even when m has a single row or a single column.
  idx <- seq_len(nrow(m))
  m1 <- m[rep(idx, each = nrow(m)), , drop = FALSE]
  m2 <- m[rep(idx, nrow(m)), , drop = FALSE]
  # Output row (i - 1) * N + j holds m[i, ] - m[j, ].
  m1 - m2
})
 #  user  system elapsed 
 #  0.02    0.00    0.02

Answer 2

正如我在另一个类似（但不完全相同）的问题的回答中所说的那样，使用 lapply 比使用 combn 要快得多。

您可以使用 lapply 来执行此操作：

## Transpose so that each original row becomes a column; the subtraction below
## then runs column-wise (for better caching).
tm <- t(m)
## Block i is tm with column i subtracted from every column, i.e. column j of
## block i is m[j, ] - m[i, ].
blocks <- lapply(seq_len(ncol(tm)), function(i) tm - tm[, i])
z <- do.call(cbind, blocks)
## Labels "i;j" with i varying slowest; the row labelled "i;j" therefore holds
## m[j, ] - m[i, ].
n <- nrow(m)
row_names <- paste(rep(seq_len(n), each = n), rep(seq_len(n), times = n),
                   sep = ";")
## Each column of z becomes one row of the result.
matrix(z, ncol = ncol(m), byrow = TRUE,
       dimnames = list(row_names, colnames(m)))

考虑一个3 * 3的小例子：

# Reproducible 3 x 3 example with named columns.
set.seed(0)
m <- matrix(rnorm(9), nrow = 3, ncol = 3,
            dimnames = list(NULL, c("x1", "x2", "x3")))

我的代码给出了如下结果（注意：标记为 “i;j” 的行存放的是第 j 行减去第 i 行，即 m[j, ] - m[i, ]）：

#             x1         x2         x3
#1;1  0.00000000  0.0000000  0.0000000
#1;2 -1.58918765 -0.8577879  0.6338466
#1;3  0.06684498 -2.8123794  0.9227999
#2;1  1.58918765  0.8577879 -0.6338466
#2;2  0.00000000  0.0000000  0.0000000
#2;3  1.65603262 -1.9545915  0.2889533
#3;1 -0.06684498  2.8123794 -0.9227999
#3;2 -1.65603262  1.9545915 -0.2889533
#3;3  0.00000000  0.0000000  0.0000000

好吧，也许我应该为那些渴望看到数字的人提供新的基准。

# a numeric matrix (not a data frame) with 500 rows and 3 named columns
N <- 500
set.seed(0)
m <- matrix(rnorm(N * 3), nrow = N, ncol = 3,
            dimnames = list(NULL, c("x1", "x2", "x3")))

## my approach
## one R-level function call per row of m; row "i;j" holds m[j, ] - m[i, ]
system.time({
  tm <- t(m)
  diff_blocks <- lapply(seq_len(ncol(tm)), function(i) tm - tm[, i])
  z <- do.call(cbind, diff_blocks)
  idx <- seq_len(nrow(m))
  row_names <- paste(rep(idx, each = length(idx)),
                     rep(idx, times = length(idx)),
                     sep = ";")
  matrix(z, ncol = ncol(m), byrow = TRUE,
         dimnames = list(row_names, colnames(m)))
})
#   user  system elapsed 
#  0.320   0.000   0.318 

## akrun's `combn()` method:
## one function call per unordered pair of rows
system.time({
  pair_diff <- function(i) m[i[1], ] - m[i[2], ]
  t(combn(nrow(m), 2, FUN = pair_diff))
})
#   user  system elapsed 
#  1.324   0.000   1.326 

## akrun's `apply()` method:
## one function call per ordered pair of rows, including each row with itself
system.time({
  # seq_len() instead of 1:nrow(m) stays correct when m has no rows.
  d1 <- expand.grid(seq_len(nrow(m)), seq_len(nrow(m)))
  # Row labels "i;j"; that row holds m[i, ] - m[j, ].
  rn <- do.call(paste, c(d1, sep = ";"))
  # as.matrix() makes explicit the coercion that apply() would otherwise
  # perform implicitly on the data frame.
  res <- t(apply(as.matrix(d1), 1, function(i) m[i[1], ] - m[i[2], ]))
  row.names(res) <- rn
})
#   user  system elapsed 
#  4.768   0.000   4.777

500行根本不大，但速度差别很大。

如果您想进行测试，可以验证：akrun 的两种方法需要 O(N^2) 次 R 层面的函数调用，而我的方法只需要 O(N) 次（输出矩阵本身有 N^2 行，因此这里比较的是解释器层面的调用开销，而不是总运算量）。N 越大，我的方法的优势就越明显。

计算行差异

2 个答案:

更新

基准