Question

我有一个大矩阵：

set.seed(1)
a <- matrix(runif(9e+07),ncol=300)

我想对矩阵中的每一行进行排序：

> system.time(sorted <- t(apply(a,1,sort)))
   user  system elapsed 
  42.48    3.40   45.88

我有很多RAM可以使用，但我希望有一种更快的方法来执行此操作。

Answer 1

嗯，据我所知，在R中加速排序的方法并不多，而且问题在于你每次只需排序300个值，却要重复很多次。不过，你可以通过直接调用sort.int并使用method='quick'来获得一些额外的性能提升：

set.seed(1)
a <- matrix(runif(9e+07),ncol=300)

# Your original code
system.time(sorted <- t(apply(a,1,sort))) # 31 secs

# sort.int with method='quick'
system.time(sorted2 <- t(apply(a,1,sort.int, method='quick'))) # 27 secs

# using a for-loop is slightly faster than apply (and avoids transpose):
system.time({sorted3 <- a; for(i in seq_len(nrow(a))) sorted3[i,] <- sort.int(a[i,], method='quick') }) # 26 secs

不过，更好的办法应该是使用parallel包对矩阵的各个部分并行排序。然而，传输数据的开销似乎太大，而且在我的机器上它开始使用交换空间（swap），因为我“只”有8 GB的内存：

library(parallel)
cl <- makeCluster(4)
system.time(sorted4 <- t(parApply(cl,a,1,sort.int, method='quick'))) # Forever...
stopCluster(cl)

Answer 2

grr包提供了另一种排序方法，可以用来加速这一特定操作（我稍微减小了矩阵的大小，以免基准测试耗时过长）：

> set.seed(1)
> a <- matrix(runif(9e+06),ncol=300)
> microbenchmark::microbenchmark(sorted <- t(apply(a,1,sort))
+                                ,sorted2 <- t(apply(a,1,sort.int, method='quick'))
+                                ,sorted3 <- t(apply(a,1,grr::sort2)),times=3,unit='s')
Unit: seconds
                                                  expr       min       lq     mean   median       uq      max neval
                        sorted <- t(apply(a, 1, sort)) 1.7699799 1.865829 1.961853 1.961678 2.057790 2.153902     3
 sorted2 <- t(apply(a, 1, sort.int, method = "quick")) 1.6162934 1.619922 1.694914 1.623551 1.734224 1.844898     3
                 sorted3 <- t(apply(a, 1, grr::sort2)) 0.9316073 1.003978 1.050569 1.076348 1.110049 1.143750     3

当矩阵包含字符时，差异变得很大：

> set.seed(1)
> a <- matrix(sample(letters,size = 9e6,replace = TRUE),ncol=300)
> microbenchmark::microbenchmark(sorted <- t(apply(a,1,sort))
+                                ,sorted2 <- t(apply(a,1,sort.int, method='quick'))
+                                ,sorted3 <- t(apply(a,1,grr::sort2)),times=3)
Unit: seconds
                                                  expr       min        lq      mean    median        uq      max neval
                        sorted <- t(apply(a, 1, sort)) 15.436045 15.479742 15.552009 15.523440 15.609991 15.69654     3
 sorted2 <- t(apply(a, 1, sort.int, method = "quick")) 15.099618 15.340577 15.447823 15.581536 15.621925 15.66231     3
                 sorted3 <- t(apply(a, 1, grr::sort2))  1.728663  1.733756  1.780737  1.738848  1.806774  1.87470     3

三者的结果相同。

> # identical() compares only its first two arguments (the third is `num.eq`), so check pairwise
> identical(sorted,sorted2) && identical(sorted2,sorted3)
[1] TRUE

Answer 3

Martin Morgan在“Fastest way to select i-th highest value from row and assign to new column”这一问题中给出了另一种出色的方法，无需使用任何外部软件包：

matrix(a[order(row(a), a)], ncol=ncol(a))

在同一链接的评论中，还给出了按列排序的等效写法。

使用与Craig相同的数据来计时代码：

set.seed(1)
a <- matrix(runif(9e7),ncol=300)

# Sort every row of a matrix in ascending order using an explicit for-loop.
#
# mat: the matrix whose rows are sorted. Defaults to the global `a` used by
#      the benchmarks, so existing `use_for()` calls keep working unchanged.
# Returns a matrix with the same dimensions as `mat`, each row sorted.
use_for <- function(mat = a){
    # Start from a copy so the result has the same dimensions and type as the input.
    sorted3 <- mat
    for(i in seq_len(nrow(mat))) {
        sorted3[i,] <- sort.int(mat[i,], method='quick')
    }
    sorted3
}

microbenchmark::microbenchmark(times=3L,
    t(apply(a,1,sort)),
    t(apply(a,1,sort.int, method='quick')),
    use_for(),
    Rfast::rowSort(a),
    t(apply(a,1,grr::sort2)),
    matrix(a[order(row(a), a)], ncol=ncol(a))
)

时间：

Unit: seconds
                                        expr       min        lq      mean    median        uq       max neval
                        t(apply(a, 1, sort)) 37.875665 40.143190 41.098627 42.410715 42.710108 43.009502     3
  t(apply(a, 1, sort.int, method = "quick")) 26.406063 27.146861 27.714226 27.887659 28.368307 28.848955     3
                                   use_for() 20.038295 20.140692 20.504223 20.243088 20.737187 21.231285     3
                           Rfast::rowSort(a)  6.105679  6.460003  6.836455  6.814326  7.201844  7.589361     3
                  t(apply(a, 1, grr::sort2)) 11.912422 13.035231 13.667377 14.158040 14.544854 14.931669     3
 matrix(a[order(row(a), a)], ncol = ncol(a)) 10.307094 10.789946 11.294119 11.272797 11.787632 12.302466     3

为了更全面地展示情况，再对字符型矩阵做一次测试（不包括Rfast::rowSort，因为它无法处理字符型数据）：

set.seed(1)
a <- matrix(sample(letters, 9e6, TRUE),ncol=300)

microbenchmark::microbenchmark(times=1L,
    t(apply(a,1,sort)),
    t(apply(a,1,sort.int, method='quick')),
    use_for(),
    #Rfast::rowSort(a),
    t(apply(a,1,grr::sort2)),
    matrix(a[order(row(a), a, method="radix")], ncol=ncol(a))
)

时间：

Unit: milliseconds
                                                          expr        min         lq       mean     median         uq        max neval
                                          t(apply(a, 1, sort)) 30392.7951 30392.7951 30392.7951 30392.7951 30392.7951 30392.7951     1
                    t(apply(a, 1, sort.int, method = "quick")) 29359.7711 29359.7711 29359.7711 29359.7711 29359.7711 29359.7711     1
                                                     use_for() 31018.8827 31018.8827 31018.8827 31018.8827 31018.8827 31018.8827     1
                                    t(apply(a, 1, grr::sort2))  2539.1711  2539.1711  2539.1711  2539.1711  2539.1711  2539.1711     1
 matrix(a[order(row(a), a, method = "radix")], ncol = ncol(a))   480.7405   480.7405   480.7405   480.7405   480.7405   480.7405     1

正面对比：

set.seed(1)
a <- matrix(sample(letters, 9e7, TRUE),ncol=300)
microbenchmark::microbenchmark(times=1L,
    t(apply(a,1,grr::sort2)),
    matrix(a[order(row(a), a, method="radix")], ncol=ncol(a))
)

时间：

Unit: seconds
                                                          expr       min        lq      mean    median        uq       max neval
                                    t(apply(a, 1, grr::sort2)) 29.098726 29.098726 29.098726 29.098726 29.098726 29.098726     1
 matrix(a[order(row(a), a, method = "radix")], ncol = ncol(a))  7.067744  7.067744  7.067744  7.067744  7.067744  7.067744     1

在R中对大型矩阵的每一行进行排序的最快方法

3 个答案: