我测试了以下代码(在300K行DF上),以确定哪个是R中并行化的最快方式(for loop vs lapply)。
Q1。并行 lapply 是否总是比并行 for 循环更快(基于 elapsed 时间的比较)?在网上的不同帖子里,我看到人们要么说"Duh!lapply 总是更快",要么说"取决于你的循环实现,for 循环也可以更快"。
Q2。更令人惊讶的是,把同样的逻辑放进一个函数里再调用(使代码看起来更干净)反而更慢。我是否正确地对它们进行了基准测试?
在30K行的数据上我也看到了类似的趋势。得到答案之后,我会再检验并行化是否随核心数的增加而扩展。
感谢。
#Results:
[1] 300000 3
[1] "For loop all conditions"
user system elapsed
1040.232 8.767 1048.897
[1] "Parallel For loop all conditions"
user system elapsed
266.861 8.462 276.064
[1] "Lapply all conditions"
user system elapsed
66.364 0.014 66.369
[1] "ParLapply all conditions"
user system elapsed
0.413 0.113 25.890
[1] "Lapply all conditions call function"
user system elapsed
5293.981 223.524 5517.128
[1] "ParLapply all conditions call function"
user system elapsed
0.492 0.082 1949.433
[1] "For loop all conditions call function"
user system elapsed
10506.028 82.372 10587.585
[1] "Parallel For loop all conditions call function"
user system elapsed
585.387 29.322 2246.441
#Code:
# Three small columns; read row-wise they give one all-positive row,
# one mixed-sign row and one all-negative row.
d1 <- c(1, 2, -3)
d2 <- c(1, -2, -2)
d3 <- c(1, -2, -4)
d <- data.frame(d1, d2, d3)

# Benchmark input: every row of `d` repeated 100,000 times in a contiguous
# run, giving a 300,000-row data frame.
s_df <- d[rep(seq_len(nrow(d)), each = 100000), ]

# Values are compared against this threshold with strict `>` and `<`.
correlThreshold <- 0
# Count of values in a row that must fall on one side of the threshold.
total_numb_input_files <- 3
# Accumulator for the names of passing rows; starts out empty (NULL).
rows_passing_consistency <- NULL
print("For loop all conditions")
# Sequential baseline: visit every row of s_df and record the names of the
# rows whose values all lie strictly above, or all lie strictly below,
# correlThreshold.
system.time({
  # Loop-invariant lookups, done once instead of on every iteration.
  row_ids <- rownames(s_df)
  n_rows <- nrow(s_df)
  # One preallocated flag per row. Appending to the result vector inside the
  # loop re-copies it on every pass, which adds quadratic overhead unrelated
  # to the loop construct being timed.
  passes <- logical(n_rows)
  # seq_len() gives an empty sequence for a zero-row frame, whereas 1:0
  # would iterate over c(1, 0).
  for (idx in seq_len(n_rows)) {
    dfx <- as.vector(unlist(s_df[idx, , drop = TRUE]))
    # Both sums are NA when the row contains an NA; the is.na() guards then
    # treat the row as not passing.
    pos <- sum(dfx > correlThreshold)
    neg <- sum(dfx < correlThreshold)
    passes[idx] <- (!is.na(pos) && pos == total_numb_input_files) ||
      (!is.na(neg) && neg == total_numb_input_files)
  }
  # Extend the accumulator once, in row order. The guard leaves it untouched
  # (possibly still NULL) when no row passed.
  if (any(passes)) {
    rows_passing_consistency <- append(rows_passing_consistency, row_ids[passes])
  }
})
print("Parallel For loop all conditions")
library(doParallel)
# Four-worker FORK cluster registered as the %dopar% backend.
cl <- makeCluster(4, type = "FORK")
registerDoParallel(cl)
# Time one foreach task per row; each task yields the row name when the row
# passes and NULL otherwise, and .combine = c concatenates the results.
# The combined vector is not stored: only the timing is of interest here.
# tryCatch(finally = ) shuts the workers down even if the timed code errors.
tryCatch(
  system.time(
    # seq_len() gives an empty sequence for a zero-row frame, unlike 1:0.
    foreach(idx = seq_len(nrow(s_df)), .combine = c) %dopar% {
      dfx <- as.vector(unlist(s_df[idx, , drop = TRUE]))
      # Both sums are NA when the row contains an NA; such a row never passes.
      pos <- sum(dfx > correlThreshold)
      neg <- sum(dfx < correlThreshold)
      all_above <- !is.na(pos) && pos == total_numb_input_files
      all_below <- !is.na(neg) && neg == total_numb_input_files
      if (all_above || all_below) rownames(s_df)[idx] else NULL
    }
  ),
  finally = stopCluster(cl)
)
print("Lapply all conditions")
# Time a sequential lapply() over row indices. Each call returns the row's
# name when all of its values lie strictly above, or all strictly below,
# correlThreshold, and NULL otherwise. The result list is discarded; only
# the timing is of interest.
system.time(
  lapply(
    # seq_len() gives an empty sequence for a zero-row frame, unlike 1:0.
    seq_len(nrow(s_df)),
    function(idx, s_df) {
      dfx <- as.vector(unlist(s_df[idx, , drop = TRUE]))
      # Both sums are NA when the row contains an NA; such a row never passes.
      pos <- sum(dfx > correlThreshold)
      neg <- sum(dfx < correlThreshold)
      all_above <- !is.na(pos) && pos == total_numb_input_files
      all_below <- !is.na(neg) && neg == total_numb_input_files
      if (all_above || all_below) rownames(s_df)[idx] else NULL
    },
    s_df
  )
)
print("ParLapply all conditions")
library(doParallel)
# Four-worker FORK cluster, handed to parLapply() directly.
cl <- makeCluster(4, type = "FORK")
# registerDoParallel(cl)  # left disabled: parLapply() receives `cl` itself
# Time parLapply() over row indices. Each call returns the row's name when
# all of its values lie strictly above, or all strictly below,
# correlThreshold, and NULL otherwise. The result list is discarded.
# tryCatch(finally = ) shuts the workers down even if the timed code errors.
tryCatch(
  system.time(
    parLapply(
      cl,
      # seq_len() gives an empty sequence for a zero-row frame, unlike 1:0.
      seq_len(nrow(s_df)),
      function(idx, s_df) {
        dfx <- as.vector(unlist(s_df[idx, , drop = TRUE]))
        # Both sums are NA when the row contains an NA; such a row never
        # passes.
        pos <- sum(dfx > correlThreshold)
        neg <- sum(dfx < correlThreshold)
        all_above <- !is.na(pos) && pos == total_numb_input_files
        all_below <- !is.na(neg) && neg == total_numb_input_files
        if (all_above || all_below) rownames(s_df)[idx] else NULL
      },
      s_df
    )
  ),
  finally = stopCluster(cl)
)
#' Report whether one row of a data frame is sign-consistent
#'
#' A row passes when the number of its values strictly above the threshold,
#' or the number strictly below it, equals `n_required`.
#'
#' @param rname Row name used to select the row via `s_df[rname, ]`.
#' @param s_df Data frame whose row values are compared against the
#'   threshold.
#' @param correl_threshold Value the row entries are compared against.
#'   Defaults to the global `correlThreshold`, looked up at call time, so
#'   existing two-argument calls behave as before.
#' @param n_required Count of entries that must fall on one side of the
#'   threshold. Defaults to the global `total_numb_input_files`, looked up
#'   at call time.
#' @return `rname` when the row passes, otherwise `NULL`. A row containing
#'   `NA` never passes, because both counts are then `NA`.
calc_consistency <- function(rname, s_df,
                             correl_threshold = correlThreshold,
                             n_required = total_numb_input_files) {
  # Flatten the selected row into a plain, unnamed vector.
  dfx <- as.vector(unlist(s_df[rname, , drop = TRUE]))
  pos <- sum(dfx > correl_threshold)
  neg <- sum(dfx < correl_threshold)
  all_above <- !is.na(pos) && pos == n_required
  all_below <- !is.na(neg) && neg == n_required
  if (all_above || all_below) rname else NULL
}
# Sequential lapply() over row names, delegating each row to
# calc_consistency(); only the timing is kept.
print("Lapply all conditions call function")
system.time(
  lapply(X = rownames(s_df), FUN = calc_consistency, s_df)
)

# The same per-row call, spread over a four-worker FORK cluster.
print("ParLapply all conditions call function")
library(doParallel)
cl <- makeCluster(spec = 4, type = "FORK")
# registerDoParallel(cl)  # left disabled: parLapply() receives `cl` itself
system.time(
  parLapply(cl = cl, X = rownames(s_df), fun = calc_consistency, s_df)
)
stopCluster(cl)
print("For loop all conditions call function")
# Sequential for loop over row names, delegating each row to
# calc_consistency() and adding the passing names to
# rows_passing_consistency.
system.time({
  row_ids <- rownames(s_df)
  # Preallocated slots, one per row. Appending to the accumulator inside the
  # loop re-copies it on every pass, which adds quadratic overhead unrelated
  # to the loop construct being timed.
  hits <- vector("list", length(row_ids))
  for (i in seq_along(row_ids)) {
    hit <- calc_consistency(row_ids[[i]], s_df)
    # Assigning NULL into a list slot would delete the slot, so only
    # non-NULL results are stored.
    if (!is.null(hit)) {
      hits[[i]] <- hit
    }
  }
  # unlist() drops the empty slots and keeps row order; it yields NULL when
  # nothing passed, in which case append() leaves the accumulator unchanged.
  rows_passing_consistency <- append(rows_passing_consistency, unlist(hits))
})
# Parallel foreach over row names on a four-worker FORK cluster, delegating
# each row to calc_consistency(); results are concatenated with c() and
# discarded, only the timing is kept.
print("Parallel For loop all conditions call function")
library(doParallel)
cl <- makeCluster(spec = 4, type = "FORK")
registerDoParallel(cl)
system.time(
  foreach(row_id = rownames(s_df), .combine = c) %dopar%
    calc_consistency(row_id, s_df)
)
stopCluster(cl)
答案 0 :(得分:1)
事实证明,速度上的主要差异来自于传给 apply 函数的是“行索引”还是“行名(rownames)”。我分别用内联写法和单独的函数调用测试了 (l)apply,并且都测了并行和非并行两种情况。使用 (l)apply 时,内联写法与函数调用之间没有太大区别,并行化的效果也同样好。主要的耗时来自于传递行名而不是行索引,虽然我不确定为什么会发生这种情况。