我有一个大矩阵:
set.seed(1)
a <- matrix(runif(9e+07),ncol=300)
我想对矩阵中的每一行进行排序:
> system.time(sorted <- t(apply(a,1,sort)))
user system elapsed
42.48 3.40 45.88
我有很多RAM可以使用,但我希望有一种更快的方法来执行此操作。
答案 0 :(得分:6)
好吧,据我所知,在R中并没有太多更快的排序方法;问题在于每次只需要排序300个值,但要重复很多次。不过,您可以通过直接调用sort.int
并使用method='quick'
来获得额外的性能:
set.seed(1)
a <- matrix(runif(9e+07),ncol=300)
# Baseline: the approach from the question
system.time(sorted <- t(apply(a, MARGIN = 1, FUN = sort))) # 31 secs
# Call sort.int directly and request the quicksort method
system.time(sorted2 <- t(apply(a, MARGIN = 1, FUN = sort.int, method = "quick"))) # 27 secs
# An explicit for-loop is slightly faster than apply() and needs no transpose
system.time({
  sorted3 <- a
  for (i in seq_len(nrow(a))) {
    sorted3[i, ] <- sort.int(a[i, ], method = "quick")
  }
}) # 26 secs
但更好的方法应该是使用并行包来并行排序矩阵的各个部分。但是,传输数据的开销似乎太大,而且在我的机器上它开始交换,因为我“只”拥有8 GB的内存:
library(parallel)
# Create a cluster of 4 workers
cl <- makeCluster(4)
# Sort the rows in parallel on the cluster and transpose the result back
system.time(
  sorted4 <- t(parApply(cl = cl, X = a, MARGIN = 1, FUN = sort.int, method = "quick"))
) # Forever...
# Shut the workers down once the work is finished
stopCluster(cl)
答案 1 :(得分:4)
包grr
包含一个备用排序方法,可以用来加速这个特定的操作(我已经稍微减小了矩阵的大小,以免这个基准测试耗时过长):
> set.seed(1)
> a <- matrix(runif(9e+06),ncol=300)
> microbenchmark::microbenchmark(sorted <- t(apply(a,1,sort))
+ ,sorted2 <- t(apply(a,1,sort.int, method='quick'))
+ ,sorted3 <- t(apply(a,1,grr::sort2)),times=3,unit='s')
Unit: seconds
expr min lq mean median uq max neval
sorted <- t(apply(a, 1, sort)) 1.7699799 1.865829 1.961853 1.961678 2.057790 2.153902 3
sorted2 <- t(apply(a, 1, sort.int, method = "quick")) 1.6162934 1.619922 1.694914 1.623551 1.734224 1.844898 3
sorted3 <- t(apply(a, 1, grr::sort2)) 0.9316073 1.003978 1.050569 1.076348 1.110049 1.143750 3
当矩阵包含字符时,差异变得很大:
> set.seed(1)
> a <- matrix(sample(letters,size = 9e6,replace = TRUE),ncol=300)
> microbenchmark::microbenchmark(sorted <- t(apply(a,1,sort))
+ ,sorted2 <- t(apply(a,1,sort.int, method='quick'))
+ ,sorted3 <- t(apply(a,1,grr::sort2)),times=3)
Unit: seconds
expr min lq mean median uq max neval
sorted <- t(apply(a, 1, sort)) 15.436045 15.479742 15.552009 15.523440 15.609991 15.69654 3
sorted2 <- t(apply(a, 1, sort.int, method = "quick")) 15.099618 15.340577 15.447823 15.581536 15.621925 15.66231 3
sorted3 <- t(apply(a, 1, grr::sort2)) 1.728663 1.733756 1.780737 1.738848 1.806774 1.87470 3
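三者的结果相同(注意:identical() 只比较前两个参数,第三个位置参数会被当作 num.eq,因此下面的调用实际上只比较了 sorted 和 sorted2;要严格验证三者相同,应写成 identical(sorted,sorted2) && identical(sorted2,sorted3))。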
> identical(sorted,sorted2,sorted3)
[1] TRUE
答案 2 :(得分:1)
另一种出色的方法来自Martin Morgan(见“Fastest way to select i-th highest value from row and assign to new column”),它无需使用任何外部软件包:
# order(row(a), a) lists the elements row by row, each row in ascending order,
# so the result has to be filled by row to put every sorted row back in place
# (matrix() fills column by column by default, which would scramble the rows).
matrix(a[order(row(a), a)], ncol = ncol(a), byrow = TRUE)
在同一链接中的注释下,还有按列排序的等效项。
使用与Craig相同的数据来计时代码:
set.seed(1)
a <- matrix(runif(9e7),ncol=300)
# Sort every row of the global matrix `a` with an explicit for-loop.
# Takes no arguments; returns a copy of `a` in which each row has been
# replaced by the result of sort.int(..., method = "quick") on that row.
use_for <- function() {
  result <- a
  for (row_idx in seq_len(nrow(a))) {
    result[row_idx, ] <- sort.int(a[row_idx, ], method = "quick")
  }
  result
}
# Time every row-sorting approach on the same numeric matrix `a`.
microbenchmark::microbenchmark(
  times = 3L,
  t(apply(a, 1, sort)),
  t(apply(a, 1, sort.int, method = "quick")),
  use_for(),
  Rfast::rowSort(a),
  t(apply(a, 1, grr::sort2)),
  # byrow = TRUE is required: a[order(row(a), a)] yields the values row by
  # row, while matrix() fills column by column by default.
  matrix(a[order(row(a), a)], ncol = ncol(a), byrow = TRUE)
)
时间:
Unit: seconds
expr min lq mean median uq max neval
t(apply(a, 1, sort)) 37.875665 40.143190 41.098627 42.410715 42.710108 43.009502 3
t(apply(a, 1, sort.int, method = "quick")) 26.406063 27.146861 27.714226 27.887659 28.368307 28.848955 3
use_for() 20.038295 20.140692 20.504223 20.243088 20.737187 21.231285 3
Rfast::rowSort(a) 6.105679 6.460003 6.836455 6.814326 7.201844 7.589361 3
t(apply(a, 1, grr::sort2)) 11.912422 13.035231 13.667377 14.158040 14.544854 14.931669 3
matrix(a[order(row(a), a)], ncol = ncol(a)) 10.307094 10.789946 11.294119 11.272797 11.787632 12.302466 3
为了展示更完整的情况,再对字符类型进行一项测试(不包括Rfast::rowSort
,因为它无法处理字符类):
set.seed(1)
a <- matrix(sample(letters, 9e6, TRUE),ncol=300)
# Time the row-sorting approaches on the character matrix `a`
# (the Rfast::rowSort entry is commented out for this run).
microbenchmark::microbenchmark(
  times = 1L,
  t(apply(a, 1, sort)),
  t(apply(a, 1, sort.int, method = "quick")),
  use_for(),
  # Rfast::rowSort(a),
  t(apply(a, 1, grr::sort2)),
  # byrow = TRUE is required: a[order(...)] yields the values row by row,
  # while matrix() fills column by column by default.
  matrix(a[order(row(a), a, method = "radix")], ncol = ncol(a), byrow = TRUE)
)
时间:
Unit: milliseconds
expr min lq mean median uq max neval
t(apply(a, 1, sort)) 30392.7951 30392.7951 30392.7951 30392.7951 30392.7951 30392.7951 1
t(apply(a, 1, sort.int, method = "quick")) 29359.7711 29359.7711 29359.7711 29359.7711 29359.7711 29359.7711 1
use_for() 31018.8827 31018.8827 31018.8827 31018.8827 31018.8827 31018.8827 1
t(apply(a, 1, grr::sort2)) 2539.1711 2539.1711 2539.1711 2539.1711 2539.1711 2539.1711 1
matrix(a[order(row(a), a, method = "radix")], ncol = ncol(a)) 480.7405 480.7405 480.7405 480.7405 480.7405 480.7405 1
两种最快方法的正面对比:
set.seed(1)
a <- matrix(sample(letters, 9e7, TRUE),ncol=300)
# Compare grr::sort2 with the order()-based approach on the larger
# character matrix `a`.
microbenchmark::microbenchmark(
  times = 1L,
  t(apply(a, 1, grr::sort2)),
  # byrow = TRUE is required: a[order(...)] yields the values row by row,
  # while matrix() fills column by column by default.
  matrix(a[order(row(a), a, method = "radix")], ncol = ncol(a), byrow = TRUE)
)
时间:
Unit: seconds
expr min lq mean median uq max neval
t(apply(a, 1, grr::sort2)) 29.098726 29.098726 29.098726 29.098726 29.098726 29.098726 1
matrix(a[order(row(a), a, method = "radix")], ncol = ncol(a)) 7.067744 7.067744 7.067744 7.067744 7.067744 7.067744 1