我有一个函数,需要在大约300万个数据点上运行。我正在尝试在一台8核的 Ubuntu 计算机上使用 mcmapply 将该函数并行化。该函数接收一个长度为300万的 list、另外3个长度为300万的向量,以及1个常数 cutoffyearmon。
该代码在单个内核上处理100000行数据只需2分钟,运行正常,不会引发任何错误。但是,当我尝试使用 mcmapply 在计算机的6个内核上并行运行代码时,它运行了5个多小时仍未结束。
更新:下面是我的函数调用的精简版本。完整版本还会针对1个月、2个月和3个月的时间窗口创建9个变量;这里只保留了6个月和1年这两个时间窗口。
我正在使用以下函数调用:
# Summarise every record's code string in parallel with mcmapply().
#
# Arguments mapped over (one element per record unless noted):
#   lst            strings made of consecutive 3-character numeric codes
#   sd             dates; converted with as.yearmon() inside the worker
#   naflag,
#   empflag        logical flags; a record is summarised only when both are
#                  FALSE (a missing flag is treated as "not usable")
#   daysdiff       elapsed days compared against the 365 / 180 thresholds;
#                  NA produces an all-NA record
#   cutoffyearmon  reference yearmon; length 1 in the sample call, so
#                  mcmapply() recycles it
#
# Every worker call returns a named list of ten numerics, always in the same
# order, so mcmapply()'s default simplification sees a uniform shape:
#   worst_abc_ever, times_abc,
#   times_abc_last_180_days, times_abc_last_365_days,
#   times_abc30_last_365_days, times_abc30_last_180_days,
#   times_abc60_last_365_days, times_abc60_last_180_days,
#   abc_last_180_days, abc_last_365_days
#
# NOTE(review): as.yearmon() runs once per record here; converting `sd` once,
# vectorised, before the call is presumably much cheaper -- confirm by
# profiling.
abc_xx_last_xxx_days <- mcmapply(
  function(abcstrnew, sd, naflag, empflag, daysdiff, cutoffyearmon) {
    stat_names <- c(
      "worst_abc_ever", "times_abc",
      "times_abc_last_180_days", "times_abc_last_365_days",
      "times_abc30_last_365_days", "times_abc30_last_180_days",
      "times_abc60_last_365_days", "times_abc60_last_180_days",
      "abc_last_180_days", "abc_last_365_days"
    )

    # Scalar tests, so use short-circuit operators; an NA flag counts as
    # "not usable" rather than aborting the whole worker.
    usable <- !is.na(naflag) && !naflag && !is.na(empflag) && !empflag
    if (is.na(daysdiff) || !usable) {
      return(setNames(rep(list(NA_real_), length(stat_names)), stat_names))
    }

    # Cut the string into consecutive 3-character pieces and convert them to
    # numbers; trailing characters that do not fill a piece are ignored.
    n_chars <- nchar(abcstrnew)
    starts <- seq(from = 1, to = n_chars - 2, by = 3)
    codes <- as.numeric(
      substring(text = abcstrnew, first = starts, last = starts + 2)
    )

    # Statistics over the first `n_periods` codes (capped at the number of
    # codes available).
    zero_window <- list(worst = 0, positive = 0, ge30 = 0, ge60 = 0)
    window_stats <- function(n_periods) {
      if (n_periods < 1) {
        # An empty window contributes nothing; `1:0` would instead pick the
        # first code.
        return(zero_window)
      }
      recent <- codes[seq_len(min(length(codes), n_periods))]
      list(
        worst = max(recent),
        positive = as.numeric(sum(recent > 0, na.rm = TRUE)),
        ge30 = as.numeric(sum(recent >= 30, na.rm = TRUE)),
        ge60 = as.numeric(sum(recent >= 60, na.rm = TRUE))
      )
    }

    last_365 <- zero_window
    last_180 <- zero_window
    if (daysdiff <= 365) {
      # Month gap between the cutoff and `sd`, derived from the day
      # difference of the two yearmon values; computed once for both windows.
      elapsed_days <- as.numeric(
        difftime(time1 = cutoffyearmon, time2 = as.yearmon(sd), units = "days")
      )
      months_elapsed <- round(round(elapsed_days) / 30)
      last_365 <- window_stats(12 - months_elapsed)
      # daysdiff <= 180 implies daysdiff <= 365, so nesting is safe.
      if (daysdiff <= 180) {
        last_180 <- window_stats(6 - months_elapsed)
      }
    }

    list(
      worst_abc_ever = max(codes),
      times_abc = as.numeric(sum(codes > 0, na.rm = TRUE)),
      times_abc_last_180_days = last_180$positive,
      times_abc_last_365_days = last_365$positive,
      times_abc30_last_365_days = last_365$ge30,
      times_abc30_last_180_days = last_180$ge30,
      times_abc60_last_365_days = last_365$ge60,
      times_abc60_last_180_days = last_180$ge60,
      abc_last_180_days = last_180$worst,
      abc_last_365_days = last_365$worst
    )
  },
  lst, sd, naflag, empflag, daysdiff, cutoffyearmon,
  mc.cores = 6, mc.preschedule = TRUE, mc.cleanup = TRUE
)
您可以使用以下输入数据来运行该函数并检查其输出。
# Two-record sample input for the mcmapply() call.
# Each string in `lst` is a run of 3-character numeric codes.
lst <- list("000050000032", "000000340000000000000")
sd <- c(as.Date("2017-05-22"), as.Date("2017-04-23"))
# Both flags are FALSE for both records.
empflag <- c(FALSE, FALSE)
naflag <- c(FALSE, FALSE)
# Time difference between 2017-06-30 and each element of `sd`.
daysdiff <- difftime(time1 = as.Date("2017-06-30"), time2 = sd)
cutoffyearmon <- as.yearmon("2017-06-30")
我原以为设置 mc.preschedule=TRUE 之后,代码会把数据大致均等地分配到6个内核上。但是我没有看到处理速度有任何显著的提升。我原本预计在该计算机的6个内核上运行时,处理会在1.5个小时左右完成。
如果我遗漏了什么,欢迎提出任何建议。
将 pbmcmapply 与 mc.cores=6 一起使用时,显示的预计剩余时间(ETA)为 06:01:32:57。