R:for fore循环中的循环

时间:2018-03-06 17:28:05

标签: r foreach parallel-processing doparallel

编辑:减小数据集的大小

示例数据:

df <- data.frame(loc.id = rep(1:10, each = 80*36), 
             year = rep(rep(1980:2015, each = 80), times = 10),
             day = rep(rep(1:80, times = 36),times = 10),
             rain = runif(10*36*80, min = 0 , max = 5),
             swc = runif(10*36*80,min = 0, max = 50),
             SW.max = rep(runif(10, min = 100, max = 200), each = 80*36),
             SW.ini = runif(10*36*80),
             PETc = runif(10*36*80, min = 0 , max = 1.3),
             SW = NA,
             PAW = NA, 
             aetc = NA)

df包含10个地点的1980 - 2015年每日数据(80天)。 对于每个位置X年组合,我想进行以下计算

list.result <- list() # create a list to store all results
ptm <- proc.time()
n <- 0

for(i in seq_along(unique(df$loc.id))){

location <- unique(df$loc.id)[i]
print(location)

for(j in seq_along(unique(df$year))){

yr <- unique(df$year)[j]
print(yr)

df_year <- df[df$loc.id == location & df$year == yr,] # subset data for location i and year y

# for the first row of data frame, i need to calculate some values 
SW.ini <- df_year$SW.ini[1] 
SW.max <- df_year$SW.max[1]

df_year$PAW[1] <- SW.ini + df_year$rain[1]
df_year$aetc[1] <- ifelse(df_year$PAW[1] >= df_year$swc[1], 
df_year$PETc[1],(df_year$PAW[1]/df_year$swc[1])*df_year$PETc[1])
df_year$aetc[1] <- ifelse(df_year$aetc[1] > df_year$PAW[1], df_year$PAW[1], df_year$aetc[1])
df_year$SW[1] <- SW.ini + df_year$rain[1] -  df_year$aetc[1]
df_year$SW[1] <- ifelse(df_year$SW[1] > SW.max, SW.max, ifelse(df_year$SW[1] < 0, 0,df_year$SW[1]))

# for row 2 till row n of df_year, I need to do this:
for (day in 2:nrow(df_year)){
df_year$PAW[day] <- df_year$SW[day - 1] + df_year$rain[day]

df_year$aetc[day] <- ifelse(df_year$PAW[day] >= df_year$swc[day], df_year$PETc[day], (df_year$PAW[day]/df_year$swc[day]) * df_year$PETc[day])

df_year$aetc[day] <- ifelse(df_year$aetc[day] > df_year$PAW[day], df_year$PAW[day],df_year$aetc[day])

df_year$SW[day] <- df_year$SW[day - 1] + df_year$rain[day] -  df_year$aetc[day]

df_year$SW[day] <- ifelse(df_year$SW[day] > SW.max,SW.max, ifelse(df_year$SW[day] < 0, 0,df_year$SW[day]))

   }
n <- n + 1
list.result[[n]] <- df_year
}}
proc.time() - ptm
user  system elapsed 
8.64    0.00    8.75

final.dat <- rbindlist(list.result)

这个循环是顺序的,我认为它是R中foreach的一个很好的候选者。我还没有真正合作过 foreach所以做一些在线研究让我想到了这个:

  library(doParallel)
  cl <- makeCluster(4) # if I understood this correctly, it assings number of cores to be used 
  registerDoParallel(cl)

  foreach(i = seq_along(unique(df$loc.id)) %dopar% {
    list.result <- list()
    for(j in seq_along(1980:2015)){

      df_year <- df[df$loc.id == unique(df$loc.id)[i] & df$year == unique(df$year)[j],] # subset data for location i and year y

      # for the first row of data frame, i need to calculate some values 
      SW.ini <- df_year$SW.ini[1] 
      SW.max <- df_year$SW.max[1]

      df_year$PAW[1] <- SW.ini + df_year$rain[1]
      df_year$aetc[1] <- ifelse(df_year$PAW[1] >= df_year$swc[1], df_year$PETc[1],(df_year$PAW[1]/df_year$swc[1])*df_year$PETc[1])
      df_year$aetc[1] <- ifelse(df_year$aetc[1] > df_year$PAW[1], df_year$PAW[1], df_year$aetc[1])
      df_year$SW[1] <- SW.ini + df_year$rain[1] -  df_year$aetc[1]
      df_year$SW[1] <- ifelse(df_year$SW[1] > SW.max, SW.max, ifelse(df_year$SW[1] < 0, 0,df_year$SW[1]))

      # for row 2 till row n of df_year, I need to do this:
      for (day in 2:nrow(df_year)){
        df_year$PAW[day] <- df_year$SW[day - 1] + df_year$rain[day]
        df_year$aetc[day] <- ifelse(df_year$PAW[day] >= df_year$swc[day], df_year$PETc[day], (df_year$PAW[day]/df_year$swc[day]) * df_year$PETc[day])
        df_year$aetc[day] <- ifelse(df_year$aetc[day] > df_year$PAW[day], df_year$PAW[day],df_year$aetc[day])
        df_year$SW[day] <- df_year$SW[day - 1] + df_year$rain[day] -  df_year$aetc[day]
        df_year$SW[day] <- ifelse(df_year$SW[day] > SW.max,SW.max, ifelse(df_year$SW[day] < 0, 0,df_year$SW[day]))

      }
      list.result[[j]] <- df_year
    }
    dat <- rbindlist(list.result)
    fwrite(dat,paste0(i,"dat.csv"))
 }

我的问题是:

1)以上数据是否适合foreach

2)foreach中有一个for循环。这有意义吗?

3)如何使上述foreach运行并返回所有结果

2 个答案:

答案 0 :(得分:3)

解决您的三个问题:

  1. 我不这么认为。 (计算效率更高的方法可以完全消除增加更多处理能力的需要。)
  2. 并行处理中的循环没有什么本质上的坏处。 (事实上,需要对每个块进行的计算越多,并行方法就越有可能提高性能。 )
  3. (如果您使用以下方法,则不适用)
  4. 使用Rcppdata.table代替

    使用C ++编译逻辑并使用data.table分组操作按组应用它可以从基线开始加速~2,000倍,远远超过您希望通过并行化获得的速度。

    在您的原始示例中, 39,420,000行,这将在我的计算机上以 1.883秒执行;在 28,800行的修订后的版本中,执行 0.004秒

    library(data.table)
    library(Rcpp)
    

    在R脚本中内联定义并编译C++函数CalcSW()

    一个注意事项:C / C++中的计数从0开始,与R不同,后者从1开始 - &#39 ;为什么这里的指数不同

    Rcpp::cppFunction('
    List CalcSW(NumericVector SW_ini,
                NumericVector SW_max,
                NumericVector rain,
                NumericVector swc,
                NumericVector PETc) {
    
      int n = SW_ini.length();
      NumericVector SW(n);
      NumericVector PAW(n);
      NumericVector aetc(n);
    
      double SW_ini_glob = SW_ini[0];
      double SW_max_glob = SW_max[0];
    
      SW[0] = SW_ini_glob;
      PAW[0] = SW[0] + rain[0];
    
      if (PAW[0] > swc[0]){
        aetc[0] = PETc[0];
      } else {
        aetc[0] = PAW[0]/swc[0]*PETc[0];
      }
    
      if (aetc[0] > PAW[0]){
        aetc[0] = PAW[0];
      }
    
      SW[0] = SW[0] + rain[0] - aetc[0];
    
      if(SW[0] > SW_max_glob){
        SW[0] = SW_max_glob;
      }
    
      if(SW[0] < 0){
        SW[0] = 0;
      }
    
      for (int i = 1; i < n; i++) {
    
        PAW[i] = SW[i-1] + rain[i];
    
        if (PAW[i] > swc[i]){
          aetc[i] = PETc[i];
        } else {
          aetc[i] = PAW[i]/swc[i]*PETc[i];
        }
    
        if (aetc[i] > PAW[i]){
          aetc[i] = PAW[i];
        }
    
        SW[i] = SW[i-1] + rain[i] - aetc[i];
    
        if(SW[i] > SW_max_glob){
          SW[i] = SW_max_glob;
        }
    
        if(SW[i] < 0){
         SW[i] = 0;
        }
      }
      return Rcpp::List::create(Rcpp::Named("SW") = SW,
                                Rcpp::Named("PAW") = PAW,
                                Rcpp::Named("aetc") = aetc);
    }')
    

    创建data.table

    df <- data.table(loc.id = rep(1:10, each = 80*36), 
                     year = rep(rep(1980:2015, each = 80), times = 10),
                     day = rep(rep(1:80, times = 36),times = 10),
                     rain = runif(10*36*80, min = 0 , max = 5),
                     swc = runif(10*36*80,min = 0, max = 50),
                     SW_max = rep(runif(10, min = 100, max = 200), each = 80*36),
                     SW_ini = runif(10*36*80),
                     PETc = runif(10*36*80, min = 0 , max = 1.3),
                     SW = as.numeric(NA),
                     PAW = as.numeric(NA), 
                     aetc = as.numeric(NA))
    
    setkey(df, loc.id, year, day)
    

    CalcSW()df的每个组合在loc.id执行函数year,同时将返回值分配给三列:

    system.time({
      df[,  c("SW","PAW","aetc") := CalcSW(SW_ini,
                                           SW_max,
                                           rain,
                                           swc,
                                           PETc), keyby = .(loc.id, year)]
    })
    

    ...

       user  system elapsed 
      0.004   0.000   0.004 
    

    结果:

    head(df)
    

    ...

       loc.id year day       rain       swc   SW_max     SW_ini      PETc       SW      PAW       aetc
    1:      1 1980   1 0.35813251 28.360715 177.3943 0.69116310 0.2870478 1.038675 1.049296 0.01062025
    2:      1 1980   2 1.10331116 37.013022 177.3943 0.02742273 0.4412420 2.125335 1.396808 0.01665171
    3:      1 1980   3 1.76680011 32.509970 177.3943 0.66273062 1.1071233 3.807561 2.483467 0.08457420
    4:      1 1980   4 3.20966558  8.252797 177.3943 0.12220454 0.3496968 6.840713 4.165693 0.17651342
    5:      1 1980   5 1.32498191 14.784203 177.3943 0.66381497 1.2168838 7.573160 7.198845 0.59253503
    6:      1 1980   6 0.02547458 47.903637 177.3943 0.21871598 1.0864713 7.418750 7.931292 0.17988449
    

    我并非100%肯定我完全实现了你的逻辑,但逻辑应该非常简单地调整我可能错过的东西,我以非常类似的方式实现它。 / p>

    另一个注意事项:通过自动缩进和代码突出显示(无论您是否使用RStudio或Emacs),您可以更轻松地编写C++您创建一个单独的文件,名称类似TestCode.cpp格式如下。

    然后,您可以使用Rcpp::sourceCpp("TestCode.cpp")在R脚本中编译函数,也可以将除前三行之外的所有内容复制并粘贴为Rcpp::cppFunction()的参数。就像我上面做的那样。

    #include <Rcpp.h>
    using namespace Rcpp;
    
    // [[Rcpp::export]]
    List CalcSW(NumericVector SW_ini,
                         NumericVector SW_max,
                         NumericVector rain,
                         NumericVector swc,
                         NumericVector PETc) {
    
      int n = SW_ini.length();
      NumericVector SW(n);
      NumericVector PAW(n);
      NumericVector aetc(n);
    
      double SW_ini_glob = SW_ini[0];
      double SW_max_glob = SW_max[0];
    
      SW[0] = SW_ini_glob;
      PAW[0] = SW[0] + rain[0];
    
      if (PAW[0] > swc[0]){
        aetc[0] = PETc[0];
      } else {
        aetc[0] = PAW[0]/swc[0]*PETc[0];
      }
    
      if (aetc[0] > PAW[0]){
        aetc[0] = PAW[0];
      }
    
      SW[0] = SW[0] + rain[0] - aetc[0];
    
      if(SW[0] > SW_max_glob){
        SW[0] = SW_max_glob;
      }
    
      if(SW[0] < 0){
        SW[0] = 0;
      }
    
      for (int i = 1; i < n; i++) {
    
        PAW[i] = SW[i-1] + rain[i];
    
        if (PAW[i] > swc[i]){
          aetc[i] = PETc[i];
        } else {
          aetc[i] = PAW[i]/swc[i]*PETc[i];
        }
    
        if (aetc[i] > PAW[i]){
          aetc[i] = PAW[i];
        }
    
        SW[i] = SW[i-1] + rain[i] - aetc[i];
    
        if(SW[i] > SW_max_glob){
          SW[i] = SW_max_glob;
        }
    
        if(SW[i] < 0){
          SW[i] = 0;
        }
      }
      return Rcpp::List::create(Rcpp::Named("SW") = SW,
                                Rcpp::Named("PAW") = PAW,
                                Rcpp::Named("aetc") = aetc);
    }
    

答案 1 :(得分:1)

此代码替换内循环

pclamp <- function(x, low, high)
    pmin(high, pmax(low, x))

fill2 <- function(rain, swc, PETc, SW0, SW.max) {

    SW <- PAW <- aetc <- matrix(0, nrow = nrow(rain), ncol = ncol(rain))

    for (day in seq_len(ncol(rain))) {
        PAW[, day] <- SW0 + rain[, day]

        aetc0 <- PETc[, day]
        idx <- PAW[, day] < swc[, day]
        aetc0[idx] <- (PAW[idx, day] / swc[idx, day]) * PETc[idx, day]
        aetc[, day] <- pmin(PAW[, day], aetc0)

        SW0 <- SW[, day] <- pclamp(PAW[, day] -  aetc[, day], 0, SW.max)
    }

    list(SW = SW, PAW = PAW, aetc = aetc)
}

并且比原始问题中的实现快约60倍。请注意,这是在C ++中采用的方法,即分配和更新新的向量,而不是data.frame的现有部分;这是性能差异的重要组成部分,并且可以在没有Rcpp的情况下获得好处。

这是一个泛化(非常轻的测试!)迭代location.year x day矩阵

days <- 80
rain <- matrix(df$rain, ncol=days, byrow=TRUE)
swc <- matrix(df$swc, ncol=days, byrow=TRUE)
PETc <- matrix(df$PETc, ncol=days, byrow=TRUE)
SW.ini <- df$SW.ini[df$day == 1]
SW.max <- df$SW.max[df$day == 1]

result <- fill2(rain, swc, PETc, SW.ini, SW.max)

来自原始输入,假设输入按年份,位置和日排序

fill1()

对于问题中的数据子集,它在per -location.date基础上比{{1}}快约15倍。对样本数据的操作大约需要10毫秒,对于完整数据大约需要10秒 - 比Matt的C ++解决方案慢5倍,但仍然比原始数据有了很大的改进,并采用了基本的R技术来改善代码许多不同的领域。