Question

我使用 foreach + doParallel 以多线程方式将函数应用于 R 中矩阵的每一行。当矩阵有很多行时，foreach 在多线程遍历迭代之前和之后都要花费很长一段时间。

例如，如果我运行：

library(foreach)
library(doParallel)

# Run a dummy per-row workload over `data` in parallel.
#
# Args:
#   data: a matrix with at least 5 columns (column 5 is read in each
#         iteration).
# Returns:
#   The per-row results (10 uniform random numbers each), combined with
#   rbind by foreach.
doWork <- function(data) {

  # setup parallel backend to use many processors; leave one core free, but
  # never request fewer than one worker (detectCores() may return 1 or NA)
  cores <- detectCores()
  number_of_cores_to_use <- max(1L, cores[1] - 1L, na.rm = TRUE)
  cat(paste('number_of_cores_to_use:', number_of_cores_to_use))
  cl <- makeCluster(number_of_cores_to_use)
  # stop cluster on exit, so the workers are released even if the loop fails
  on.exit({
    cat('...Stop cluster')
    stopCluster(cl)
  }, add = TRUE)
  # No clusterExport() here: the loop body uses neither 'ns' nor 'weights',
  # and this script never defines them, so exporting them would fail.
  registerDoParallel(cl)
  cat('...Starting foreach initialization')

  output <- foreach(i = seq_len(nrow(data)), .combine = rbind) %dopar% {
    cat(i)
    y <- data[i, 5]
    a <- 100
    # Useless busy work; a separate index keeps the foreach `i` intact
    for (j in 1:3) {
      b <- matrix(runif(a * a), nrow = a, ncol = a)
    }
    runif(10)
  }

  output
}

# Build a 100000 x 10 matrix of uniform random numbers, run doWork() on it,
# and print the first ten rows of the result.
r <- 100000
c <- 10
data <- matrix(runif(r * c), nrow = r, ncol = c)
output <- doWork(data)
output[seq_len(10), ]

CPU使用率如下（100％表示所有核心都已充分利用）：

带注释的

：

如何优化代码，以便 foreach 在多线程遍历迭代之前和之后不需要很长时间？主要的时间消耗在于多线程迭代前后的这段时间。随着 foreach 迭代次数的增加，这段时间显著增加，有时会使代码变得和使用简单的 for 循环一样慢。

另一个例子（让我们假设lm和poly不能将矩阵作为参数）：

library(foreach)
library(doParallel)

# Fit a weighted quadratic regression to every (sorted) row of `data` in
# parallel and return the fitted coefficients.
#
# Args:
#   data:    a numeric matrix; each row is sorted and regressed on itself
#            shifted by one position.
#   weights: numeric weights passed to lm(); one per observation, i.e.
#            ncol(data) - 1 values.
# Returns:
#   The coefficient vectors of the per-row fits, combined with rbind by
#   foreach.
doWork <- function(data, weights) {

  # setup parallel backend to use many processors; leave one core free, but
  # never request fewer than one worker (detectCores() may return 1 or NA)
  cores <- detectCores()
  number_of_cores_to_use <- max(1L, cores[1] - 1L, na.rm = TRUE)
  cat(paste('number_of_cores_to_use:', number_of_cores_to_use))
  cl <- makeCluster(number_of_cores_to_use)
  # stop cluster on exit, so the workers are released even if a fit fails
  on.exit({
    cat('...Stop cluster')
    stopCluster(cl)
  }, add = TRUE)
  # Export the `weights` argument of this call. clusterExport() looks in the
  # global environment by default, which would ignore the argument (or fail
  # when no global of that name exists).
  clusterExport(cl = cl, varlist = c('weights'), envir = environment())
  registerDoParallel(cl)
  cat('...Starting foreach initialization')

  output <- foreach(i = seq_len(nrow(data)), .combine = rbind) %dopar% {
    x <- sort(data[i, ])
    fit <- lm(x[1:(length(x) - 1)] ~ poly(x[-1], degree = 2, raw = TRUE),
              na.action = na.omit, weights = weights)
    fit$coef
  }

  output
}

# Build a 10000 x 10 matrix of uniform random numbers plus one weight per
# regression observation, run doWork(), and print the first ten rows.
r <- 10000
c <- 10
weights <- runif(c - 1)
data <- matrix(runif(r * c), nrow = r, ncol = c)
output <- doWork(data, weights)
output[seq_len(10), ]

Answer 1

试试这个：

// NOTE(review): this is JavaScript (it looks like a React-style setState
// updater that returns `counter` incremented by one), not R. It appears
// unrelated to the foreach question -- presumably a misplaced snippet; the R
// code for this answer appears inline in the remarks below. Confirm against
// the original post.
this.setState((prevState, props) => ({
    counter: prevState.counter + 1
}));

在我的计算机上，这样速度只提高到约两倍（它只有4个核心）。备注：

使用超过一半核心通常是无用的。
您的数据不是很大，因此在此处使用 big.matrix 可能没用。代码如下：

devtools::install_github("privefl/bigstatsr")
library(bigstatsr)
options(bigstatsr.ncores.max = parallel::detectCores())

doWork2 <- function(data, weights, ncores = parallel::detectCores() - 1) {
  big_parallelize(data, p.FUN = function(X.desc, ind, weights) {
    X <- bigstatsr::attach.BM(X.desc)
    output.part <- matrix(0, 3, length(ind))
    for (i in seq_along(ind)) {
      x <- sort(X[, ind[i]])
      fit <- lm(x[1:(length(x)-1)] ~ poly(x[-1], degree = 2, raw = TRUE),
                na.action = na.omit, weights = weights)
      output.part[, i] <- fit$coef
    }
    t(output.part)
  }, p.combine = "rbind", ncores = ncores, weights = weights)
}

system.time({
  data.bm <- as.big.matrix(t(data))
  output2 <- doWork2(data.bm, weights)
})

all.equal(output, output2, check.attributes = FALSE)
big_parallelize 将矩阵按列分成 ncores 个块，在每个块上应用您的函数，然后合并结果。
在函数中，最好在循环之前预先创建输出，然后填充它，而不是使用 foreach 对所有结果进行 rbind。
我只访问列而不是行。

所有这些都是很好的做法，但它与您的数据并不相关。使用更多内核和更大的数据集时，增益应该更高。

基本上，如果你想超快，用 Rcpp 重新实现 lm 部分将是一个很好的解决方案。

Answer 2

正如F.Privé在评论中提到的那样：

我认为问题在于rbind。 rbind列表中的大量值需要很长时间。此外，填充行很糟糕，因为矩阵按列存储。此外，制作一个长的foreach循环效率不高（改为使用块）。

使用块（如果使用5个核心，每个核心接收20％的矩阵）：

library(foreach)
library(doParallel)


# Split the rows of a matrix into contiguous, nearly equally-sized chunks.
#
# Adapted from:
# [Partition matrix into N equally-sized chunks with R](https://stackoverflow.com/a/45198299/395857)
# Author: lmo
#
# Args:
#   data:             a matrix (or data frame) to partition by rows.
#   number_of_chunks: requested number of chunks; capped to the range
#                     [1, nrow(data)] so that no chunk is empty.
# Returns:
#   A list of sub-matrices in row order that together contain every row of
#   `data` exactly once; an empty list when `data` has no rows.
array_split <- function(data, number_of_chunks) {
  n_rows <- nrow(data)
  if (n_rows == 0L) {
    return(list())
  }
  n_chunks <- max(1L, min(as.integer(number_of_chunks), n_rows))
  row_idx <- seq_len(n_rows)
  # Assign chunk ids arithmetically. Cutting at pretty() breaks drops row 1
  # whenever the lowest break equals 1 and ignores the requested chunk count.
  chunk_id <- ceiling(row_idx * n_chunks / n_rows)
  # drop = FALSE keeps one-row chunks as matrices so nrow() works on them
  lapply(split(row_idx, chunk_id), function(rows) data[rows, , drop = FALSE])
}


# Fit a weighted quadratic regression to every (sorted) row of `data`,
# processing the rows in parallel in one chunk per worker.
#
# Args:
#   data: a numeric matrix; each row is sorted and regressed on itself
#         shifted by one position.
# Globals:
#   weights: numeric weights passed to lm(), read from the global
#            environment (one per observation, i.e. ncol(data) - 1 values).
# Returns:
#   The per-chunk coefficient matrices (degree_poly + 1 columns), combined
#   with rbind by foreach.
doWork <- function(data) {

  # setup parallel backend to use many processors; leave one core free, but
  # never request fewer than one worker (detectCores() may return 1 or NA)
  cores <- detectCores()
  number_of_cores_to_use <- max(1L, cores[1] - 1L, na.rm = TRUE)
  cat(paste('number_of_cores_to_use:', number_of_cores_to_use))
  cl <- makeCluster(number_of_cores_to_use)
  # stop cluster on exit, so the workers are released even if a fit fails
  on.exit({
    cat('...Stop cluster')
    stopCluster(cl)
  }, add = TRUE)
  # `weights` is a global rather than a parameter, so export it explicitly.
  # 'ns' is no longer exported: nothing in this script defines or uses it.
  clusterExport(cl = cl, varlist = c('weights'))
  registerDoParallel(cl)

  cat('...Starting array split')
  number_of_chunks <- number_of_cores_to_use
  data_chunks <- array_split(data = data, number_of_chunks = number_of_chunks)
  degree_poly <- 2

  cat('...Starting foreach initialization')
  # Iterate over the chunks themselves so each task receives only its own
  # chunk instead of referencing the whole list by index.
  output <- foreach(data_temporary = data_chunks, .combine = rbind) %dopar% {

    # Preallocate the chunk's result and fill it row by row
    output_temporary <- matrix(0, nrow = nrow(data_temporary),
                               ncol = degree_poly + 1)
    for (row in seq_len(nrow(data_temporary))) {
      x <- sort(data_temporary[row, ])
      fit <- lm(x[1:(length(x) - 1)] ~ poly(x[-1], degree = degree_poly,
                                            raw = TRUE),
                na.action = na.omit, weights = weights)
      output_temporary[row, ] <- fit$coef
    }
    output_temporary
  }

  output
}

# Build a 100000 x 10 matrix of uniform random numbers and the global
# `weights` vector that doWork() reads, run doWork(), and print the first
# ten rows.
r <- 100000
c <- 10
weights <- runif(c - 1)
data <- matrix(runif(r * c), nrow = r, ncol = c)
output <- doWork(data)
output[seq_len(10), ]

仅供参考：

如何在多线程遍历迭代之前和之后减少前期的时间？

2 个答案: