我使用grouped_data_list
在列表foreach and dopar
的元素上运行循环。
运行时速度非常慢,而工作人员显然很忙。
如果我使用lapply
制作矢量化例程,并且没有并行,则需要几秒钟。我的dopar
有什么问题?
library(data.table)
library('doParallel') # parallel cpu implementation
library('foreach') # parallel looping
grouped_data_dt <- data.table(
Who=c("thdeg","mjg","dfdf","system","df","system","system","hegha","ydvw")
, DocumentExtension=c("jpg","com","dug","182","27","pdf","png","xslt","53")
, What_Action=c("added","removed","added","added","added","removed","added","added","added")
, Date=as.Date(c("2017-11-08","2017-10-10","2017-09-14","2017-09-20","2017-09-21","2017-10-20","2017-10-19","2017-08-24","2017-09-17"))
, Count=c(1,2,3,4,5,6,7,8,9)
)
reported_date_seq_dt <- data.table(
reported_date_seq = as.Date(c(
"2017-08-23","2017-08-24","2017-08-25","2017-08-26","2017-08-27","2017-08-28","2017-08-29","2017-08-30","2017-08-31","2017-09-01","2017-09-02"
,"2017-09-03","2017-09-04","2017-09-05","2017-09-06","2017-09-07","2017-09-08","2017-09-09","2017-09-10","2017-09-11","2017-09-12","2017-09-13"
,"2017-09-14","2017-09-15","2017-09-16","2017-09-17","2017-09-18","2017-09-19","2017-09-20","2017-09-21","2017-09-22","2017-09-23","2017-09-24"
,"2017-09-25","2017-09-26","2017-09-27","2017-09-28","2017-09-29","2017-09-30","2017-10-01","2017-10-02","2017-10-03","2017-10-04","2017-10-05"
,"2017-10-06","2017-10-07","2017-10-08","2017-10-09","2017-10-10","2017-10-11","2017-10-12","2017-10-13","2017-10-14","2017-10-15","2017-10-16"
,"2017-10-17","2017-10-18","2017-10-19","2017-10-20","2017-10-21","2017-10-22","2017-10-23","2017-10-24","2017-10-25","2017-10-26","2017-10-27"
,"2017-10-28","2017-10-29","2017-10-30","2017-10-31","2017-11-01","2017-11-02","2017-11-03","2017-11-04","2017-11-05","2017-11-06","2017-11-07"
,"2017-11-08","2017-11-09","2017-11-10","2017-11-11","2017-11-12","2017-11-13","2017-11-14","2017-11-15","2017-11-16","2017-11-17","2017-11-18"
,"2017-11-19","2017-11-20","2017-11-21","2017-11-22","2017-11-23","2017-11-24","2017-11-25","2017-11-26","2017-11-27"
))
)
grouped_data_list <-
split(x = grouped_data_dt
, drop = T
, by = c("Who", "DocumentExtension", "What_Action")
, sorted = T
, keep.by = T
)
cl <- makeCluster(4)
registerDoParallel(cl)
## replace NA with zeros in the timeseries
grouped_data_list_2 <- list()
foreach(
i = 1:length(grouped_data_list)
) %dopar%
{
x <- grouped_data_list[[i]]
data.table::setkey(x, Date)
dt_params <- unlist(
x[1, -c('Date', 'Count'), with = F]
)
y <- x[reported_date_seq_dt]
y[is.na(Count), (colnames(y)[!colnames(y) %in% c('Date', 'Count')]) := lapply(1:length(dt_params), function(x) dt_params[x])]
y[is.na(Count), Count := 0]
grouped_data_list_2 <- c(grouped_data_list_2
, list(y)
)
}
stopCluster(cl)
lapply例程:
## after grouped_data_list is created
rm(group_replace_func)
group_replace_func <- function(x)
{
setkey(x, Date)
dt_params <- unlist(
x[1, -c('Date', 'Count'), with = F]
)
y <- x[reported_date_seq_dt]
y[is.na(Count), (colnames(y)[!colnames(y) %in% c('Date', 'Count')]) := lapply(1:length(dt_params), function(x) dt_params[x])]
y[is.na(Count), Count := 0]
return(y)
}
grouped_data_list_2 <- lapply(
grouped_data_list
, group_replace_func
)
快速运作的新版本(@ Roland的建议):
## parallel work
cl <- makeCluster(4)
registerDoParallel(cl)
## replace NA with zeros in the timeseries
grouped_data_list_2 <- list()
grouped_data_list_2 <- foreach(
x = grouped_data_list
) %dopar%
{
data.table::setkey(x, Date)
dt_params <- unlist(
x[1, -c('Date', 'Count'), with = F]
)
y <- x[reported_date_seq_dt]
y[is.na(Count), (colnames(y)[!colnames(y) %in% c('Date', 'Count')]) := lapply(1:length(dt_params), function(x) dt_params[x])]
y[is.na(Count), Count := 0]
y
}
stopCluster(cl)