R 中 for 循环 vs lapply,以及因调用函数而导致的耗时增加

时间:2017-10-23 17:23:41

标签: r doparallel

我测试了以下代码(在300K行DF上),以确定哪个是R中并行化的最快方式(for loop vs lapply)。

Q1。并行 lapply 是否总是比并行 for 循环更快(基于经过时间的比较)?根据网上的不同帖子,我看到有人说“废话!lapply 总是更快”,也有人说“取决于你的循环实现,for 循环也可以更快”。

Q2。更令人惊讶的是,通过调用函数来运行类似的代码(使代码看起来更整洁)反而慢得多。我对它们进行基准测试的方式是否正确?

在 30K 行的数据上我也看到了类似的趋势。根据得到的答案,我接下来会测试并行化是否随着核心数的增加而扩展。

感谢。

#Results:
[1] 300000      3
[1] "For loop all conditions"
    user   system  elapsed 
1040.232    8.767 1048.897 
[1] "Parallel For loop all conditions"
   user  system elapsed 
266.861   8.462 276.064 
[1] "Lapply all conditions"
   user  system elapsed 
 66.364   0.014  66.369 
[1] "ParLapply all conditions"
   user  system elapsed 
  0.413   0.113  25.890 
[1] "Lapply all conditions call function"
    user   system  elapsed 
5293.981  223.524 5517.128 
[1] "ParLapply all conditions call function"
    user   system  elapsed 
   0.492    0.082 1949.433 
[1] "For loop all conditions call function"
     user    system   elapsed 
10506.028    82.372 10587.585 
[1] "Parallel For loop all conditions call function"
    user   system  elapsed 
 585.387   29.322 2246.441 

#Code:  
# Seed data: row 1 is all positive, row 2 has mixed signs, row 3 is all
# negative.
d1 <- c(1, 2, -3)
d2 <- c(1, -2, -2)
d3 <- c(1, -2, -4)
d <- data.frame(d1, d2, d3)

# Benchmark input: each seed row repeated 100000 times back to back
# (300000 rows, 3 columns).
s_df <- d[rep(seq_len(nrow(d)), each = 100000), , drop = FALSE]

# Settings read by every benchmark below.
correlThreshold <- 0
total_numb_input_files <- 3
# Accumulator used by the plain for-loop benchmarks; starts out empty (NULL).
rows_passing_consistency <- NULL


print("For loop all conditions")
# Benchmark: plain for loop with the consistency check written inline.
# A row passes when every value is strictly above correlThreshold or every
# value is strictly below it (the count must equal total_numb_input_files).
system.time({
  n_rows <- nrow(s_df)
  # Preallocate one slot per row instead of growing the result with append()
  # on every iteration, which copies the whole vector each time (O(n^2)).
  hits <- vector("list", n_rows)
  # seq_len() yields an empty sequence for a zero-row frame; 1:nrow() would
  # iterate over c(1, 0).
  for (idx in seq_len(n_rows)) {
    dfx <- as.vector(unlist(s_df[idx, , drop = TRUE]))
    # Looked up on every iteration on purpose: the other inline benchmark
    # variants do the same, which keeps the per-row work comparable.
    rname <- rownames(s_df)[idx]
    pos <- sum(dfx > correlThreshold)
    neg <- sum(dfx < correlThreshold)

    # sum() is NA when the row contains NA; such rows never pass.
    all_pos <- !is.na(pos) && pos == total_numb_input_files
    all_neg <- !is.na(neg) && neg == total_numb_input_files
    if (all_pos || all_neg) {
      hits[[idx]] <- rname
    }
  }
  # Slots left as NULL vanish in unlist(); the result is NULL when no row
  # passes, matching an untouched c() accumulator.
  rows_passing_consistency <- unlist(hits, use.names = FALSE)
})



print("Parallel For loop all conditions")
# Benchmark: foreach/%dopar% over row indices on a 4-worker FORK cluster,
# with the consistency check written inline.
library(doParallel)
cl <- makeCluster(4, type = "FORK")
# tryCatch(finally = ) guarantees the workers are shut down even when the
# timed expression fails; a bare stopCluster() afterwards would be skipped.
tryCatch(
  {
    registerDoParallel(cl)
    system.time(
      # seq_len() is safe for a zero-row frame; 1:nrow() would yield c(1, 0).
      foreach(idx = seq_len(nrow(s_df)), .combine = c) %dopar% {
        dfx <- as.vector(unlist(s_df[idx, , drop = TRUE]))
        rname <- rownames(s_df)[idx]
        pos <- sum(dfx > correlThreshold)
        neg <- sum(dfx < correlThreshold)

        # sum() is NA when the row contains NA; such rows never pass.
        all_pos <- !is.na(pos) && pos == total_numb_input_files
        all_neg <- !is.na(neg) && neg == total_numb_input_files
        # A passing row contributes its row name; any other row contributes
        # NULL, which .combine = c drops.
        if (all_pos || all_neg) rname else NULL
      }
    )
  },
  finally = stopCluster(cl)
)



print("Lapply all conditions")
# Benchmark: sequential lapply over row indices with the consistency check
# written as an inline anonymous function.
system.time(
  lapply(
    # seq_len() is safe for a zero-row frame; 1:nrow() would yield c(1, 0).
    seq_len(nrow(s_df)),
    function(idx, s_df) {
      dfx <- as.vector(unlist(s_df[idx, , drop = TRUE]))
      rname <- rownames(s_df)[idx]
      pos <- sum(dfx > correlThreshold)
      neg <- sum(dfx < correlThreshold)

      # sum() is NA when the row contains NA; such rows never pass.
      all_pos <- !is.na(pos) && pos == total_numb_input_files
      all_neg <- !is.na(neg) && neg == total_numb_input_files
      # Row name for a passing row, NULL otherwise.
      if (all_pos || all_neg) rname else NULL
    },
    s_df
  )
)



print("ParLapply all conditions")
# Benchmark: parLapply over row indices on a 4-worker FORK cluster, with the
# consistency check written as an inline anonymous function.
library(doParallel)
cl <- makeCluster(4, type = "FORK")
# No registerDoParallel() here: parLapply() receives the cluster directly.
# tryCatch(finally = ) guarantees the workers are shut down even when the
# timed expression fails; a bare stopCluster() afterwards would be skipped.
tryCatch(
  system.time(
    parLapply(
      cl,
      # seq_len() is safe for a zero-row frame; 1:nrow() would yield c(1, 0).
      seq_len(nrow(s_df)),
      function(idx, s_df) {
        dfx <- as.vector(unlist(s_df[idx, , drop = TRUE]))
        rname <- rownames(s_df)[idx]
        pos <- sum(dfx > correlThreshold)
        neg <- sum(dfx < correlThreshold)

        # sum() is NA when the row contains NA; such rows never pass.
        all_pos <- !is.na(pos) && pos == total_numb_input_files
        all_neg <- !is.na(neg) && neg == total_numb_input_files
        # Row name for a passing row, NULL otherwise.
        if (all_pos || all_neg) rname else NULL
      },
      s_df
    )
  ),
  finally = stopCluster(cl)
)





#' Check whether one row of a data frame is sign-consistent.
#'
#' A row is consistent when the number of values strictly above `threshold`
#' equals `n_required`, or the number of values strictly below `threshold`
#' equals `n_required`.
#'
#' @param rname Row selector used as `s_df[rname, ]`; the benchmarks pass a
#'   row name. It is also the value returned for a consistent row.
#' @param s_df Data frame whose row `rname` is inspected.
#' @param threshold Cut-off the values are compared against. Defaults to the
#'   global `correlThreshold`, which is what the function always read before.
#' @param n_required Count of values that must fall on one side of
#'   `threshold`. Defaults to the global `total_numb_input_files`.
#' @return `rname` when the row is consistent, otherwise `NULL`. A row
#'   containing `NA` is never consistent.
calc_consistency <- function(rname, s_df,
                             threshold = correlThreshold,
                             n_required = total_numb_input_files) {
  # Flatten the selected row into a plain, unnamed vector.
  dfx <- as.vector(unlist(s_df[rname, , drop = TRUE]))
  pos <- sum(dfx > threshold)
  neg <- sum(dfx < threshold)

  # sum() yields NA when the row contains NA; the is.na() guards keep the
  # comparison from feeding NA into the if condition.
  all_pos <- !is.na(pos) && pos == n_required
  all_neg <- !is.na(neg) && neg == n_required
  if (all_pos || all_neg) rname else NULL
}

print("Lapply all conditions call function")
# Benchmark: sequential lapply over the row names, delegating each row to
# calc_consistency(); s_df is forwarded to it as its second argument.
system.time(
  lapply(X = rownames(s_df), FUN = calc_consistency, s_df = s_df)
)

print("ParLapply all conditions call function")
# Benchmark: parLapply over the row names on a 4-worker FORK cluster,
# delegating each row to calc_consistency().
library(doParallel)
cl <- makeCluster(4, type = "FORK")
# No registerDoParallel() here: parLapply() receives the cluster directly.
# tryCatch(finally = ) guarantees the workers are shut down even when the
# timed expression fails; a bare stopCluster() afterwards would be skipped.
tryCatch(
  system.time(parLapply(cl, rownames(s_df), calc_consistency, s_df)),
  finally = stopCluster(cl)
)

print("For loop all conditions call function")
# Benchmark: plain for loop over the row names, delegating each row to
# calc_consistency().
system.time({
  row_ids <- rownames(s_df)
  # Preallocate one slot per row instead of growing the result with append()
  # on every iteration, which copies the whole vector each time (O(n^2)).
  hits <- vector("list", length(row_ids))
  for (i in seq_along(row_ids)) {
    hit <- calc_consistency(row_ids[[i]], s_df)
    # Assigning NULL into a list slot would delete the slot, so only store
    # actual hits.
    if (!is.null(hit)) {
      hits[[i]] <- hit
    }
  }
  # Start from a clean result: appending onto whatever an earlier benchmark
  # left in rows_passing_consistency duplicated every passing row and made
  # this run start from an already large vector.
  rows_passing_consistency <- unlist(hits, use.names = FALSE)
})

print("Parallel For loop all conditions call function")
# Benchmark: foreach/%dopar% over the row names on a 4-worker FORK cluster,
# delegating each row to calc_consistency().
library(doParallel)
cl <- makeCluster(4, type = "FORK")
# tryCatch(finally = ) guarantees the workers are shut down even when the
# timed expression fails; a bare stopCluster() afterwards would be skipped.
tryCatch(
  {
    registerDoParallel(cl)
    system.time(
      # Rows that do not pass return NULL, which .combine = c drops.
      foreach(rname = rownames(s_df), .combine = c) %dopar% {
        calc_consistency(rname, s_df)
      }
    )
  },
  finally = stopCluster(cl)
)

1 个答案:

答案 0 :(得分:1)

事实证明,速度上的主要差异来自于传给 apply 函数的是“行索引”还是“行名(rownames)”。我分别用 (l)apply 测试了内联函数和单独的函数调用,以及使用和不使用并行化的情况。使用 apply 时,内联与函数调用没有太大区别,并行化的效果也同样良好。主要的耗时差异来自传递索引还是传递行名,不过我不确定为什么会这样。