%dopar%的data.table操作非常慢

时间:2018-01-15 13:57:55

标签: r data.table doparallel

我使用grouped_data_list在列表foreach and dopar的元素上运行循环。

运行时速度非常慢,而工作人员显然很忙。

如果我使用lapply制作矢量化例程,并且没有并行,则需要几秒钟。我的dopar有什么问题?

library(data.table)
library('doParallel') # parallel cpu implementation
library('foreach') # parallel looping

grouped_data_dt <- data.table(
Who=c("thdeg","mjg","dfdf","system","df","system","system","hegha","ydvw")
, DocumentExtension=c("jpg","com","dug","182","27","pdf","png","xslt","53")
, What_Action=c("added","removed","added","added","added","removed","added","added","added")
, Date=as.Date(c("2017-11-08","2017-10-10","2017-09-14","2017-09-20","2017-09-21","2017-10-20","2017-10-19","2017-08-24","2017-09-17"))
, Count=c(1,2,3,4,5,6,7,8,9)
)

reported_date_seq_dt <- data.table(
reported_date_seq = as.Date(c(
"2017-08-23","2017-08-24","2017-08-25","2017-08-26","2017-08-27","2017-08-28","2017-08-29","2017-08-30","2017-08-31","2017-09-01","2017-09-02"
,"2017-09-03","2017-09-04","2017-09-05","2017-09-06","2017-09-07","2017-09-08","2017-09-09","2017-09-10","2017-09-11","2017-09-12","2017-09-13"
,"2017-09-14","2017-09-15","2017-09-16","2017-09-17","2017-09-18","2017-09-19","2017-09-20","2017-09-21","2017-09-22","2017-09-23","2017-09-24"
,"2017-09-25","2017-09-26","2017-09-27","2017-09-28","2017-09-29","2017-09-30","2017-10-01","2017-10-02","2017-10-03","2017-10-04","2017-10-05"
,"2017-10-06","2017-10-07","2017-10-08","2017-10-09","2017-10-10","2017-10-11","2017-10-12","2017-10-13","2017-10-14","2017-10-15","2017-10-16"
,"2017-10-17","2017-10-18","2017-10-19","2017-10-20","2017-10-21","2017-10-22","2017-10-23","2017-10-24","2017-10-25","2017-10-26","2017-10-27"
,"2017-10-28","2017-10-29","2017-10-30","2017-10-31","2017-11-01","2017-11-02","2017-11-03","2017-11-04","2017-11-05","2017-11-06","2017-11-07"
,"2017-11-08","2017-11-09","2017-11-10","2017-11-11","2017-11-12","2017-11-13","2017-11-14","2017-11-15","2017-11-16","2017-11-17","2017-11-18"
,"2017-11-19","2017-11-20","2017-11-21","2017-11-22","2017-11-23","2017-11-24","2017-11-25","2017-11-26","2017-11-27"
))
     )

grouped_data_list <- 
          split(x = grouped_data_dt
                , drop = T
                , by = c("Who", "DocumentExtension", "What_Action")
                , sorted = T
                , keep.by = T
          )


 cl <- makeCluster(4)
 registerDoParallel(cl)


 ## replace NA with zeros in the timeseries

 grouped_data_list_2 <- list()

 foreach(
      i = 1:length(grouped_data_list)
         ) %dopar%
 {

      x <- grouped_data_list[[i]]

      data.table::setkey(x, Date)

      dt_params <- unlist(
           x[1, -c('Date', 'Count'), with = F]
           )

      y <- x[reported_date_seq_dt]

      y[is.na(Count), (colnames(y)[!colnames(y) %in% c('Date', 'Count')]) := lapply(1:length(dt_params), function(x) dt_params[x])]

      y[is.na(Count), Count := 0]

      grouped_data_list_2 <- c(grouped_data_list_2
                               , list(y)
      )
 }

 stopCluster(cl)

lapply例程:

## after grouped_data_list is created

 rm(group_replace_func)

 group_replace_func <- function(x)
 {

      setkey(x, Date)

      dt_params <- unlist(
      x[1, -c('Date', 'Count'), with = F]
      )

      y <- x[reported_date_seq_dt]

      y[is.na(Count), (colnames(y)[!colnames(y) %in% c('Date', 'Count')]) := lapply(1:length(dt_params), function(x) dt_params[x])]

      y[is.na(Count), Count := 0]

      return(y)

 }

 grouped_data_list_2 <- lapply(
                          grouped_data_list
                          , group_replace_func
                          )

快速运作的新版本(@ Roland的建议):

## parallel work

     cl <- makeCluster(4)
     registerDoParallel(cl)


     ## replace NA with zeros in the timeseries

     grouped_data_list_2 <- list()

     grouped_data_list_2 <- foreach(
          x = grouped_data_list
             ) %dopar%
     {

          data.table::setkey(x, Date)

          dt_params <- unlist(
               x[1, -c('Date', 'Count'), with = F]
          )

          y <- x[reported_date_seq_dt]

          y[is.na(Count), (colnames(y)[!colnames(y) %in% c('Date', 'Count')]) := lapply(1:length(dt_params), function(x) dt_params[x])]

          y[is.na(Count), Count := 0]

          y

     }

     stopCluster(cl)

0 个答案:

没有答案