并行选项给出不同的结果

时间:2012-10-09 04:17:47

标签: r parallel-processing plyr

这与我之前的问题“mclapply vs for loops for plotting: speed and scalability focus”有关。我现在有三四种方法可以做同样的事情,但即使设置了随机种子,得到的结果也不相同。

首先......为什么结果不同......

results2 和 results3 有效吗?

虽然在第一个示例(如下所示)中没有体现……但有时 results2 比不使用 .parallel = TRUE 选项时耗时更长,为什么会这样?

> rm(list=ls())
> gc()
         used (Mb) gc trigger  (Mb) max used  (Mb)
Ncells 253264 13.6    1801024  96.2  1643320  87.8
Vcells 829208  6.4   22407472 171.0 28009341 213.7
> require(ggplot2)
Loading required package: ggplot2
> require(plyr)
Loading required package: plyr
> require(foreach)
Loading required package: foreach
foreach: simple, scalable parallel programming from Revolution Analytics
Use Revolution R for scalability, fault tolerance and more.
http://www.revolutionanalytics.com
> require(doMC)
Loading required package: doMC   
Loading required package: iterators
Loading required package: multicore
> registerDoMC(cores=4)
> gc()
          used (Mb) gc trigger  (Mb) max used  (Mb)
Ncells  409261 21.9    1440819  77.0  1643320  87.8
Vcells 1039134  8.0   17925977 136.8 28009341 213.7
> df <- expand.grid(i = 1:100, j = 1:2 , k = seq(100, 500, 100))
> params <- mapply(list, n = df[, 3], mu = df[, 1], stdev = df[,2], SIMPLIFY = F)
> ff <- function(tlist) {
+     set.seed(123)
+     n <- tlist$n
+     mu <- tlist$mu
+     stdev <- tlist$stdev
+     x1 <- c(1:n)
+     y1 <- rnorm(n,mu,stdev)
+     z1 <- data.frame(cbind(x1,y1))
+     ggplot(z1, aes(x=x1,y=y1))+
+         geom_point()+
+         labs(title=paste("n=",n,"mu=",mu, "stdev=",stdev))
+ }
> system.time(results <- llply(params, ff))
   user  system elapsed 
  5.363   0.009   5.368 
> system.time(results2 <- llply(params, ff,.parallel=TRUE))
   user  system elapsed 
  2.689   0.259   2.938 
> system.time(results3 <- mclapply(params, ff, mc.cores = 4, mc.preschedule = TRUE))
   user  system elapsed 
  7.488   0.685   2.501 
> identical(results,results2)
[1] FALSE
> identical(results,results3)
[1] FALSE
> identical(results2,results3)
[1] FALSE

比较

require(rbenchmark)
benchmark(results <- llply(params, ff),
          results2 <- llply(params, ff,.parallel=TRUE), 
           results3 <- mclapply(params, ff, mc.cores = 4, mc.preschedule = TRUE),
             replications=5 )

给出以下结果:

                                                                    test replications elapsed relative user.self sys.self user.child sys.child
 1                                          results <- llply(params, ff)            5  27.869    1.521    27.833    0.043      0.000     0.000
 2                       results2 <- llply(params, ff, .parallel = TRUE)            5 109.990    6.003     5.455    2.472     37.565     7.048
 3 results3 <- mclapply(params, ff, mc.cores = 4, mc.preschedule = TRUE)            5  18.322    1.000     1.582    1.545     42.730    10.441

在进行10次重复而不仅仅是5次时更奇怪!

 benchmark(results <- llply(params, ff),
      results2 <- llply(params, ff,.parallel=TRUE), 
      results3 <- mclapply(params, ff, mc.cores = 4, mc.preschedule = TRUE),
       replications=10 )

得到以下结果:

                                                                     test replications elapsed relative user.self sys.self user.child sys.child
  1                                          results <- llply(params, ff)           10  55.031    1.000    54.641    0.144      0.000     0.000
  2                       results2 <- llply(params, ff, .parallel = TRUE)           10 107.801    1.959     9.877    6.045     80.022    23.062
  3 results3 <- mclapply(params, ff, mc.cores = 4, mc.preschedule = TRUE)           10 297.556    5.407     3.576    5.624     96.493    31.035

这里发生了什么?

0 个答案:

没有答案