Question

我想将一个大矩阵 mt 拆分为子矩阵列表 res。每个子矩阵的行数由向量 lens 指定。

例如，

> mt=matrix(c(1:20),ncol=2)
> mt
      [,1] [,2]
 [1,]    1   11
 [2,]    2   12
 [3,]    3   13
 [4,]    4   14
 [5,]    5   15
 [6,]    6   16
 [7,]    7   17
 [8,]    8   18
 [9,]    9   19
[10,]   10   20

lens=c(2,3,5)

我想要的是一个函数 some_function ，它可以提供以下结果，

> res=some_function(mt,lens)

> res
[[1]]
     [,1] [,2]
[1,]    1   11
[2,]    2   12

[[2]]
     [,1] [,2]
[1,]    3   13
[2,]    4   14
[3,]    5   15

[[3]]
     [,1] [,2]
[1,]    6   16
[2,]    7   17
[3,]    8   18
[4,]    9   19
[5,]   10   20

速度是一个大问题。越快越好！

非常感谢！

Answer 1

下面的函数先根据 len 中的每个长度创建行索引分组，再按分组拆分矩阵。

mt <- matrix(c(1:20), ncol=2)

# Split a matrix into a list of sub-matrices made of consecutive rows.
# Arguments:
#   m   - matrix to split
#   len - vector of row counts, one entry per sub-matrix, taken in row order
# Returns a list named "1", "2", ... with one matrix per non-zero entry of len.
m_split <- function(m, len){
  # seq_len()/seq_along() stay empty for empty input, unlike 1:n
  index <- seq_len(sum(len))
  group <- rep(seq_along(len), times = len)
  index_list <- split(index, group)
  # Subset the argument m (not a global), and keep one-row groups as matrices
  lapply(index_list, function(vec) m[vec, , drop = FALSE])
}

m_split(mt, c(2, 3, 5))
$`1`
     [,1] [,2]
[1,]    1   11
[2,]    2   12

$`2`
     [,1] [,2]
[1,]    3   13
[2,]    4   14
[3,]    5   15

$`3`
     [,1] [,2]
[1,]    6   16
[2,]    7   17
[3,]    8   18
[4,]    9   19
[5,]   10   20

更新

我使用以下代码来比较这篇文章中每种方法的性能。

library(microbenchmark)
library(data.table)

# Test case from @missuse
mt <- matrix(c(1:20000000),ncol=10)
lens <- c(20000,15000,(nrow(mt)-20000-15000))

# Functions from @Damiano Fantini
# Split the rows of a matrix into a list of data frames.
# Arguments:
#   mt   - matrix to split (converted with as.data.frame())
#   lens - vector of row counts, one entry per piece, taken in row order
# Returns the list produced by split(), named by group number.
# NOTE: the dotted name looks like an S3 method of split(); it is kept
# unchanged so existing callers still work.
split.df <- function(mt, lens) {
  # Group id for every row; rep(times = ) is vectorised and, together with
  # seq_along(), stays empty for empty lens (1:length(lens) would not)
  fac <- rep(seq_along(lens), times = lens)
  split(as.data.frame(mt), f = fac)
}

# Split the rows of a matrix into an unnamed list of sub-matrices.
# Arguments:
#   mt   - matrix to split
#   lens - vector of row counts, one entry per piece, taken in row order
# Returns one matrix per group that has at least one row.
# NOTE: the dotted name looks like an S3 method of split(); it is kept
# unchanged so existing callers still work.
split.mat <- function(mt, lens) {
  # Group id for every row; seq_along() stays empty for empty lens
  fac <- rep(seq_along(lens), times = lens)
  lapply(unique(fac), function(i) {
    # which() gives integer row numbers, so a short fac is never recycled
    # as a logical mask; drop = FALSE keeps one-row groups as matrices
    mt[which(fac == i), , drop = FALSE]
  })
}

# Benchmarking
# Time the four approaches on the same mt and lens defined above.
# Each expression is labelled m1..m4; the labels appear in the "expr" column
# of the printed result.
microbenchmark(m1 = {m_split(mt, lens)}, # @ycw's method
               # m2 builds the group vector inside the timed expression
               m2 = {pam = rep(1:length(lens), times = lens)
                     split(data.table(mt), pam)}, # @missuse's data.table method
               m3 = {split.df(mt, lens)}, # @Damiano Fantini's data frame method
               m4 = {split.mat(mt, lens)}) # @Damiano Fantini's matrix method

Unit: milliseconds
 expr      min       lq     mean   median       uq      max neval
   m1 167.6896 209.7746 251.0932 230.5920 274.9347 555.8839   100
   m2 402.3415 497.2397 554.1094 547.9603 599.7632 787.4112   100
   m3 552.8548 657.6245 719.2548 711.4123 769.6098 989.6779   100
   m4 166.6581 203.6799 249.2965 235.5856 275.4790 547.4927   100

我们可以看到，m1和m4是最快的，而它们之间几乎没有差异，这意味着不需要将matrix转换为data.table或data frame，特别是如果OP将继续使用matrix。直接在matrix上工作（m1和m4）就足够了。

Answer 2

一种方法是使用split，但它只能对向量和data.frame进行操作，因此您需要先转换矩阵——data.table应该比较高效。

# Example data: a 10 x 2 matrix and the row count of each piece
mt=matrix(c(1:20),ncol=2)
lens=c(2,3,5)
# Group id for every row: 1 repeated lens[1] times, 2 repeated lens[2] times, ...
pam = rep(1:length(lens), times = lens)
library(data.table)
# Convert the matrix to a data.table and split its rows by group id
mt_split <- split(data.table(mt), pam)
mt_split
#output
$`1`
   V1 V2
1:  1 11
2:  2 12

$`2`
   V1 V2
1:  3 13
2:  4 14
3:  5 15

$`3`
   V1 V2
1:  6 16
2:  7 17
3:  8 18
4:  9 19
5: 10 20

检查速度

# Larger test case: 20,000,000 values in 10 columns (2,000,000 rows)
mt=matrix(c(1:20000000),ncol=10)
# Two fixed-size pieces; the third piece takes all remaining rows
lens=c(20000,15000,(nrow(mt)-20000-15000))
# Group id for every row
pam = rep(1:length(lens), times = lens)

# Time the conversion to data.table together with the split
system.time(split(data.table(mt), pam))
#output   
user  system elapsed 
0.75    0.20    0.96

Answer 3

如果您可以使用data.frame而不是矩阵，则可以根据lens构建分组因子/向量，然后使用split()。或者，使用此分组向量对矩阵取子集并返回列表。在这个例子中，我将两个解决方案分别包装成两个函数：

# your data
mt=matrix(c(1:20),ncol=2)
lens=c(2,3,5)

# based on split
# Split the rows of a matrix into a list of data frames.
# Arguments:
#   mt   - matrix to split (converted with as.data.frame())
#   lens - vector of row counts, one entry per piece, taken in row order
# Returns the list produced by split(), named by group number.
# NOTE: the dotted name looks like an S3 method of split(); it is kept
# unchanged so existing callers still work.
split.df <- function(mt, lens) {
  # Group id for every row; rep(times = ) is vectorised and, together with
  # seq_along(), stays empty for empty lens (1:length(lens) would not)
  fac <- rep(seq_along(lens), times = lens)
  split(as.data.frame(mt), f = fac)
}
split.df(mt, lens)

# based on subsetting
# Split the rows of a matrix into an unnamed list of sub-matrices.
# Arguments:
#   mt   - matrix to split
#   lens - vector of row counts, one entry per piece, taken in row order
# Returns one matrix per group that has at least one row.
# NOTE: the dotted name looks like an S3 method of split(); it is kept
# unchanged so existing callers still work.
split.mat <- function(mt, lens) {
  # Group id for every row; seq_along() stays empty for empty lens
  fac <- rep(seq_along(lens), times = lens)
  lapply(unique(fac), function(i) {
    # which() gives integer row numbers, so a short fac is never recycled
    # as a logical mask; drop = FALSE keeps one-row groups as matrices
    mt[which(fac == i), , drop = FALSE]
  })
}
split.mat(mt, lens)

根据microbenchmark，第二个选项比另一个选项快约10倍

library(microbenchmark)
microbenchmark({split.df(mt, lens)}, times = 1000)
# median = 323.743 microseconds
microbenchmark({split.mat(mt, lens)}, times = 1000)
# median = 31.7645 microseconds

基于行长度向量将矩阵转换为矩阵列表

3 个答案:

更新