Question

我试图从具有“属于一起”的列的矩阵到已形成相关子矩阵的行总和的矩阵。即从

开始

     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
[1,]    1    5    9   13   17   21   25   29   33    37    41    45    49    53    57    61
[2,]    2    6   10   14   18   22   26   30   34    38    42    46    50    54    58    62
[3,]    3    7   11   15   19   23   27   31   35    39    43    47    51    55    59    63
[4,]    4    8   12   16   20   24   28   32   36    40    44    48    52    56    60    64

到

     [,1] [,2] [,3] [,4] [,5]
[1,]   15   30   46  185  220
[2,]   18   32   48  190  224
[3,]   21   34   50  195  228
[4,]   24   36   52  200  232

我认为必须有一些更优雅，更快捷的方法来实现这一点，而不是像我在下面那样循环索引（特别是，我的真实矩阵将更像4000到数千）。

example <- matrix(1:64, nrow=4) myindex <- c(1,1,1,2,2,3,3,4,4,4,4,4,5,5,5,5) summed <- matrix( rep(unique(myindex), each=dim(example)[1]), nrow=dim(example)[1]) for (i in 1:length(unique(myindex))){ summed[,i] <- apply(X=example[,(myindex==i)], MARGIN=1, FUN=sum) }

可能是我缺乏应用和tapply的经验，这使我无法解决这个问题。当然也欢迎快速的dplyr方法。

Answer 1

我们可以使用sapply

的单行班次

sapply(unique(myindex), function(x) rowSums(example[, which(myindex == x), drop = FALSE]))

     [,1] [,2] [,3] [,4] [,5]
[1,]   15   30   46  185  220
[2,]   18   32   48  190  224
[3,]   21   34   50  195  228
[4,]   24   36   52  200  232

我们让sapply循环遍历myindex的所有唯一值，并使用which来定义应包含在rowSums中的列。

编辑：包含drop = FALSE以防止单个索引简化为向量。谢谢@ mt1022指出错误！

Answer 2

我们也可以通过split ting

执行此操作

sapply(split.default(as.data.frame(example), myindex), rowSums)
#     1  2  3   4   5
#[1,] 15 30 46 185 220
#[2,] 18 32 48 190 224
#[3,] 21 34 50 195 228
#[4,] 24 36 52 200 232

Answer 3

另一种方法......

example <- matrix(1:64, nrow=4)
myindex <- c(1,1,1,2,2,3,3,4,4,4,4,4,5,5,5,5)

summed <- t(apply(example,1,cumsum))
summed <- summed[,cumsum(rle(myindex)$lengths)]
summed[,-1] <- t(apply(summed,1,diff))
summed

     [,1] [,2] [,3] [,4] [,5]
[1,]   15   30   46  185  220
[2,]   18   32   48  190  224
[3,]   21   34   50  195  228
[4,]   24   36   52  200  232

Answer 4

矩阵乘法的替代方法（大数据集的效率较低）：

x <- matrix(0, nrow = ncol(example), ncol = max(myindex))
x[cbind(1:ncol(example), myindex)] <- 1
example %*% x

#      [,1] [,2] [,3] [,4] [,5]
# [1,]   15   30   46  185  220
# [2,]   18   32   48  190  224
# [3,]   21   34   50  195  228
# [4,]   24   36   52  200  232

以下是与实际数据大小匹配的示例数据的基准：

library(microbenchmark)

n_row <- 4000
n_col <- 3020
example <- matrix(rnorm(n_row * n_col), nrow = n_row)
myindex <- ceiling((1:n_col)/5)

microbenchmark(
    matrix = {
        x <- matrix(0, nrow = ncol(example), ncol = max(myindex))
        x[cbind(1:ncol(example), myindex)] <- 1
        example %*% x
    },
    split = {  # by akrun
        sapply(split.default(as.data.frame(example), myindex), rowSums)
    },
    which = {  # by LAP
        sapply(unique(myindex), function(x) rowSums(example[, which(myindex == x)]))
    },
    times = 10
)

# Unit: milliseconds
#    expr       min        lq     mean    median       uq      max neval
#  matrix 982.55727 989.65177 992.7295 992.91230 997.3704 999.0066    10
#   split 162.13377 162.57711 194.5668 167.92963 182.5335 403.8740    10
#   which  90.28227  94.82681 119.3977  96.03701 103.1125 316.9170    10

基于列索引对矩阵的行求和

4 个答案: