如何使用嵌套的for循环优化R代码以使其运行更快?

时间:2018-09-27 19:26:36

标签: r optimization

我有以下代码。我想对其进行优化以使其运行更快。

原始代码

func1 = function(l, k, n,N,p){

  y = c()
  M = 1:N

  for(j in 1:l){
    for(i in 1:k){
      r = sample(x = M,size = k,replace = FALSE)
      sample_data = list(X = p$X[r],Y = p$Y[r])
      y = append(y, sample_data$X[rank(sample_data$Y,ties.method = 'average') == i])
      M = setdiff(M,r)
    }
  }
  return(y)
}

func2 <- function(sample) {
  data1 = sort(sample)
  data2 = rank(data1, ties.method = 'average') / length(data1)
  data3 = pnorm(data1, mean = 0, sd = 1)
  data_diff = abs(data3 - data2)
  max_data = max(data_diff)
  return(max_data)
}

N = 20000
l = c(3,5)
k = c(3,5,10,15)
n = l*k

cov_Matrix <- matrix(c (1, 0.5,
                        0.5, 1),nrow=2,ncol=2,byrow=TRUE)

set.seed(100)
p0 <- as.data.frame(MASS::mvrnorm(N, mu = c(0,0),Sigma = cov_Matrix,empirical = FALSE))
names(p0) <- c('X','Y')
p0 = as.list(p0)
set.seed(NULL)

desired_matrix = matrix(rep(0,length(l)*length(k)),nrow = length(l))

#start_time <- Sys.time()

D = c()
for(q in 1:length(k)){
  for(p in 1:length(l)){
    desired_matrix[p,q] = {
      for(s in 1:5000){
        D = append(D,func2(func1(l[p],k[q],l[p]*k[q],N,p0)))
      }
      quantile(D,probs = 0.95)
    }
    D = c()
  }
}

#end_time <- Sys.time()
#end_time - start_time

使用end_time - start_time计算的原始代码所花费的时间为 1.2小时

改进的代码版本1

func1 <- function(l, k, n, N, p){
  y <- vector("integer", l*k)
  M <- 1:N

  combi <- expand.grid(1:k, 1:l)
  l_vector <- unlist(combi[2], use.names = F)
  k_vector <- unlist(combi[1], use.names = F)

  y_len <- length(y)

  for (i in 1:y_len) {
    r <- sample(x = M, size = k, replace = FALSE)
    sample_data = list(X = p$X[r], Y = p$Y[r])
    y[i] <- sample_data$X[rank(sample_data$Y, ties.method = 'average') == k_vector[i]]
    M <- setdiff(M, r)
  }
  return (y)
}

func2 <- function(sample) {
  data1 = sort(sample)
  data2 = rank(data1, ties.method = 'average') / length(data1)
  data3 = pnorm(data1, mean = 0, sd = 1)
  data_diff = abs(data3 - data2)
  max_data = max(data_diff)
  return(max_data)
}

N = 20000
l = c(3,5)
k = c(3,5,10,15)

cov_Matrix <- matrix(c (1, 0.5,
                        0.5, 1),nrow=2,ncol=2,byrow=TRUE)

set.seed(100)
p0 <- as.data.frame(MASS::mvrnorm(N, mu = c(0,0),Sigma = cov_Matrix,empirical = FALSE))
names(p0) <- c('X','Y')
p0 = as.list(p0)
set.seed(NULL)

#start_time <- Sys.time()

combi <- expand.grid(k, l)
l_vector <- unlist(combi[2], use.names = F)
k_vector <- unlist(combi[1], use.names = F)

desired_matrix_as_vector <- vector("integer", length(l)*length(k))
for (i in 1:length(desired_matrix_as_vector)) {
  l_val <- l_vector[i]
  k_val <- k_vector[i]

  D <- replicate(n = 5000, expr = func1(l_val, k_val, l_val*k_val, N, p0), simplify = FALSE)
  D <- lapply(D, func2)
  D <- unlist(D, use.names = FALSE)
  desired_matrix_as_vector[i] <- quantile(D, probs = 0.95)
}

desired_matrix = matrix(desired_matrix_as_vector,nrow = length(l), byrow = TRUE)

#end_time <- Sys.time()
#end_time - start_time

使用end_time - start_time计算的改进版本1 花费的时间为 35分钟

改进的代码版本2

我将M <- setdiff(M, r)中的func1更改为M <- M[!M %in% r]。所以func1看起来像这样

func1 <- function(l, k, n, N, p){
  y <- vector("integer", l*k)
  M <- 1:N

  combi <- expand.grid(1:k, 1:l)
  l_vector <- unlist(combi[2], use.names = F)
  k_vector <- unlist(combi[1], use.names = F)

  y_len <- length(y)

  for (i in 1:y_len) {
    r <- sample(x = M, size = k, replace = FALSE)
    sample_data = list(X = p$X[r], Y = p$Y[r])
    y[i] <- sample_data$X[rank(sample_data$Y, ties.method = 'average') == k_vector[i]]
    M <- M[!M %in% r]
  }
  return (y)
}

使用end_time - start_time计算的改进代码版本2 花费的时间为 15分钟

会话信息:

> sessionInfo()
R version 3.4.4 (2018-03-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.1 LTS

Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1

locale:
 [1] LC_CTYPE=en_IN.UTF-8       LC_NUMERIC=C               LC_TIME=en_IN.UTF-8       
 [4] LC_COLLATE=en_IN.UTF-8     LC_MONETARY=en_IN.UTF-8    LC_MESSAGES=en_IN.UTF-8   
 [7] LC_PAPER=en_IN.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
[10] LC_TELEPHONE=C             LC_MEASUREMENT=en_IN.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] microbenchmark_1.4-4 forcats_0.3.0        stringr_1.3.1        dplyr_0.7.5         
 [5] purrr_0.2.5          readr_1.1.1          tidyr_0.8.1          tibble_1.4.2        
 [9] ggplot2_2.2.1        tidyverse_1.2.1     

loaded via a namespace (and not attached):
 [1] tidyselect_0.2.4 reshape2_1.4.3   haven_1.1.2      lattice_0.20-35  colorspace_1.3-2
 [6] htmltools_0.3.6  yaml_2.1.19      rlang_0.2.1      pillar_1.2.3     foreign_0.8-69  
[11] glue_1.2.0       modelr_0.1.2     readxl_1.1.0     bindrcpp_0.2.2   bindr_0.1.1     
[16] plyr_1.8.4       munsell_0.5.0    gtable_0.2.0     cellranger_1.1.0 rvest_0.3.2     
[21] psych_1.8.4      evaluate_0.10.1  knitr_1.20       parallel_3.4.4   broom_0.4.4     
[26] Rcpp_0.12.17     scales_0.5.0     backports_1.1.2  jsonlite_1.5     mnormt_1.5-5    
[31] hms_0.4.2        digest_0.6.15    stringi_1.2.3    grid_3.4.4       rprojroot_1.3-2 
[36] cli_1.0.0        tools_3.4.4      magrittr_1.5     lazyeval_0.2.1   crayon_1.3.4    
[41] pkgconfig_2.0.1  MASS_7.3-49      xml2_1.2.0       lubridate_1.7.4  assertthat_0.2.0
[46] rmarkdown_1.10   httr_1.3.1       rstudioapi_0.7   R6_2.2.2         nlme_3.1-131    
[51] compiler_3.4.4  

系统信息:

> Sys.info()
                                      sysname                                       release 
                                      "Linux"                           "4.15.0-34-generic" 
                                      version                                      nodename 
"#37-Ubuntu SMP Mon Aug 27 15:21:48 UTC 2018"                                   "mysystem" 
                                      machine                                         login 
                                     "x86_64"                                     "unknown" 

是否有更好的方法编写改进的版本或原始代码以加快速度?我希望限制自己尽可能地使用base R进行操作,尽管我愿意在需要时使用额外的软件包。

1 个答案:

答案 0 :(得分:1)

重写了func1

func1 <- function(l, k, N, p, bins, k_v){
  y <- vector("integer", l*k)
  y_len <- l*k
  ss <- sample.int(N)
  Mlist <- split(ss, bins)
  for (i in 1:y_len) {
    r <- Mlist[[i]]
    o <- rank(p$Y[r], ties.method = 'average')
    ii <- o == k_v[i]
    y[i] <- p$X[r][ii]
  }
  return(y)
}

并更改了主循环:

start_time <- Sys.time()

combi <- expand.grid(k, l)
l_vector <- unlist(combi[2], use.names = F)
k_vector <- unlist(combi[1], use.names = F)

desired_matrix_as_vector <- vector("integer", length(l)*length(k))
nn = 5000
for (i in 1:length(desired_matrix_as_vector)) {
  l_val <- l_vector[i]
  k_val <- k_vector[i]

  combi2 <- expand.grid(1:k_val, 1:l_val)
  k_vector2 <- unlist(combi2[1], use.names = F)

  binss  <- rep(1:ceiling(N / k_val), each = k_val)
  binss <- binss[1:N]
  binss <- as.factor(binss)

  D <- replicate(n = nn, expr = func1(l_val, k_val, N, p0, binss, k_vector2),
                 simplify = FALSE)
  D <- sapply(D, func2)
  desired_matrix_as_vector[i] <- quantile(D, probs = 0.95)
}

desired_matrix = matrix(desired_matrix_as_vector,nrow = length(l), byrow = TRUE)

end_time <- Sys.time()
end_time - start_time

对于nn = 5000,现在应该运行大约1分钟。

编辑:

使用

o <- .Internal(rank(p$Y[r], length(p$Y[r]), 'average'))

快30-40%(即35-40瑞典克朗)。

更新:

如果我们在matrix周围使用污垢(不好的)骇客,我们可以省去10秒钟的时间:

func1 <- function(l, k, N, p, k_v){
  y <- vector("integer", l*k)
  ss <- sample.int(N)
  m <- matrix(ss, k)
  for (i in 1:(l*k)) {
    r <- m[, i]
    x <- p$Y[r]
    o <- .Internal(rank(x, length(x), 'average'))
    ii <- o == k_v[i]
    y[i] <- p$X[r][ii]
  }
  return(y)
}

此外,由于我们不需要采样所有“ N”个元素,因此可以使用:

ss <- sample.int(N, (l*k*k))