我有以下代码。我想对其进行优化以使其运行更快。
func1 = function(l, k, n,N,p){
y = c()
M = 1:N
for(j in 1:l){
for(i in 1:k){
r = sample(x = M,size = k,replace = FALSE)
sample_data = list(X = p$X[r],Y = p$Y[r])
y = append(y, sample_data$X[rank(sample_data$Y,ties.method = 'average') == i])
M = setdiff(M,r)
}
}
return(y)
}
func2 <- function(sample) {
data1 = sort(sample)
data2 = rank(data1, ties.method = 'average') / length(data1)
data3 = pnorm(data1, mean = 0, sd = 1)
data_diff = abs(data3 - data2)
max_data = max(data_diff)
return(max_data)
}
N = 20000
l = c(3,5)
k = c(3,5,10,15)
n = l*k
cov_Matrix <- matrix(c (1, 0.5,
0.5, 1),nrow=2,ncol=2,byrow=TRUE)
set.seed(100)
p0 <- as.data.frame(MASS::mvrnorm(N, mu = c(0,0),Sigma = cov_Matrix,empirical = FALSE))
names(p0) <- c('X','Y')
p0 = as.list(p0)
set.seed(NULL)
desired_matrix = matrix(rep(0,length(l)*length(k)),nrow = length(l))
#start_time <- Sys.time()
D = c()
for(q in 1:length(k)){
for(p in 1:length(l)){
desired_matrix[p,q] = {
for(s in 1:5000){
D = append(D,func2(func1(l[p],k[q],l[p]*k[q],N,p0)))
}
quantile(D,probs = 0.95)
}
D = c()
}
}
#end_time <- Sys.time()
#end_time - start_time
使用end_time - start_time
计算的原始代码所花费的时间为 1.2小时。
func1 <- function(l, k, n, N, p){
y <- vector("integer", l*k)
M <- 1:N
combi <- expand.grid(1:k, 1:l)
l_vector <- unlist(combi[2], use.names = F)
k_vector <- unlist(combi[1], use.names = F)
y_len <- length(y)
for (i in 1:y_len) {
r <- sample(x = M, size = k, replace = FALSE)
sample_data = list(X = p$X[r], Y = p$Y[r])
y[i] <- sample_data$X[rank(sample_data$Y, ties.method = 'average') == k_vector[i]]
M <- setdiff(M, r)
}
return (y)
}
func2 <- function(sample) {
data1 = sort(sample)
data2 = rank(data1, ties.method = 'average') / length(data1)
data3 = pnorm(data1, mean = 0, sd = 1)
data_diff = abs(data3 - data2)
max_data = max(data_diff)
return(max_data)
}
N = 20000
l = c(3,5)
k = c(3,5,10,15)
cov_Matrix <- matrix(c (1, 0.5,
0.5, 1),nrow=2,ncol=2,byrow=TRUE)
set.seed(100)
p0 <- as.data.frame(MASS::mvrnorm(N, mu = c(0,0),Sigma = cov_Matrix,empirical = FALSE))
names(p0) <- c('X','Y')
p0 = as.list(p0)
set.seed(NULL)
#start_time <- Sys.time()
combi <- expand.grid(k, l)
l_vector <- unlist(combi[2], use.names = F)
k_vector <- unlist(combi[1], use.names = F)
desired_matrix_as_vector <- vector("integer", length(l)*length(k))
for (i in 1:length(desired_matrix_as_vector)) {
l_val <- l_vector[i]
k_val <- k_vector[i]
D <- replicate(n = 5000, expr = func1(l_val, k_val, l_val*k_val, N, p0), simplify = FALSE)
D <- lapply(D, func2)
D <- unlist(D, use.names = FALSE)
desired_matrix_as_vector[i] <- quantile(D, probs = 0.95)
}
desired_matrix = matrix(desired_matrix_as_vector,nrow = length(l), byrow = TRUE)
#end_time <- Sys.time()
#end_time - start_time
使用end_time - start_time
计算的改进版本1 花费的时间为 35分钟。
我将M <- setdiff(M, r)
中的func1
更改为M <- M[!M %in% r]
。所以func1
看起来像这样
func1 <- function(l, k, n, N, p){
y <- vector("integer", l*k)
M <- 1:N
combi <- expand.grid(1:k, 1:l)
l_vector <- unlist(combi[2], use.names = F)
k_vector <- unlist(combi[1], use.names = F)
y_len <- length(y)
for (i in 1:y_len) {
r <- sample(x = M, size = k, replace = FALSE)
sample_data = list(X = p$X[r], Y = p$Y[r])
y[i] <- sample_data$X[rank(sample_data$Y, ties.method = 'average') == k_vector[i]]
M <- M[!M %in% r]
}
return (y)
}
使用end_time - start_time
计算的改进代码版本2 花费的时间为 15分钟。
> sessionInfo()
R version 3.4.4 (2018-03-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.1 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
locale:
[1] LC_CTYPE=en_IN.UTF-8 LC_NUMERIC=C LC_TIME=en_IN.UTF-8
[4] LC_COLLATE=en_IN.UTF-8 LC_MONETARY=en_IN.UTF-8 LC_MESSAGES=en_IN.UTF-8
[7] LC_PAPER=en_IN.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=en_IN.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] microbenchmark_1.4-4 forcats_0.3.0 stringr_1.3.1 dplyr_0.7.5
[5] purrr_0.2.5 readr_1.1.1 tidyr_0.8.1 tibble_1.4.2
[9] ggplot2_2.2.1 tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] tidyselect_0.2.4 reshape2_1.4.3 haven_1.1.2 lattice_0.20-35 colorspace_1.3-2
[6] htmltools_0.3.6 yaml_2.1.19 rlang_0.2.1 pillar_1.2.3 foreign_0.8-69
[11] glue_1.2.0 modelr_0.1.2 readxl_1.1.0 bindrcpp_0.2.2 bindr_0.1.1
[16] plyr_1.8.4 munsell_0.5.0 gtable_0.2.0 cellranger_1.1.0 rvest_0.3.2
[21] psych_1.8.4 evaluate_0.10.1 knitr_1.20 parallel_3.4.4 broom_0.4.4
[26] Rcpp_0.12.17 scales_0.5.0 backports_1.1.2 jsonlite_1.5 mnormt_1.5-5
[31] hms_0.4.2 digest_0.6.15 stringi_1.2.3 grid_3.4.4 rprojroot_1.3-2
[36] cli_1.0.0 tools_3.4.4 magrittr_1.5 lazyeval_0.2.1 crayon_1.3.4
[41] pkgconfig_2.0.1 MASS_7.3-49 xml2_1.2.0 lubridate_1.7.4 assertthat_0.2.0
[46] rmarkdown_1.10 httr_1.3.1 rstudioapi_0.7 R6_2.2.2 nlme_3.1-131
[51] compiler_3.4.4
> Sys.info()
sysname release
"Linux" "4.15.0-34-generic"
version nodename
"#37-Ubuntu SMP Mon Aug 27 15:21:48 UTC 2018" "mysystem"
machine login
"x86_64" "unknown"
是否有更好的方法编写改进的版本或原始代码以加快速度?我希望限制自己尽可能地使用base R进行操作,尽管我愿意在需要时使用额外的软件包。
答案 0 :(得分:1)
重写了func1
:
func1 <- function(l, k, N, p, bins, k_v){
y <- vector("integer", l*k)
y_len <- l*k
ss <- sample.int(N)
Mlist <- split(ss, bins)
for (i in 1:y_len) {
r <- Mlist[[i]]
o <- rank(p$Y[r], ties.method = 'average')
ii <- o == k_v[i]
y[i] <- p$X[r][ii]
}
return(y)
}
并更改了主循环:
start_time <- Sys.time()
combi <- expand.grid(k, l)
l_vector <- unlist(combi[2], use.names = F)
k_vector <- unlist(combi[1], use.names = F)
desired_matrix_as_vector <- vector("integer", length(l)*length(k))
nn = 5000
for (i in 1:length(desired_matrix_as_vector)) {
l_val <- l_vector[i]
k_val <- k_vector[i]
combi2 <- expand.grid(1:k_val, 1:l_val)
k_vector2 <- unlist(combi2[1], use.names = F)
binss <- rep(1:ceiling(N / k_val), each = k_val)
binss <- binss[1:N]
binss <- as.factor(binss)
D <- replicate(n = nn, expr = func1(l_val, k_val, N, p0, binss, k_vector2),
simplify = FALSE)
D <- sapply(D, func2)
desired_matrix_as_vector[i] <- quantile(D, probs = 0.95)
}
desired_matrix = matrix(desired_matrix_as_vector,nrow = length(l), byrow = TRUE)
end_time <- Sys.time()
end_time - start_time
对于nn = 5000
,现在应该运行大约1分钟。
编辑:
使用
o <- .Internal(rank(p$Y[r], length(p$Y[r]), 'average'))
快30-40%(即35-40瑞典克朗)。
更新:
如果我们在matrix
周围使用污垢(不好的)骇客,我们可以省去10秒钟的时间:
func1 <- function(l, k, N, p, k_v){
y <- vector("integer", l*k)
ss <- sample.int(N)
m <- matrix(ss, k)
for (i in 1:(l*k)) {
r <- m[, i]
x <- p$Y[r]
o <- .Internal(rank(x, length(x), 'average'))
ii <- o == k_v[i]
y[i] <- p$X[r][ii]
}
return(y)
}
此外,由于我们不需要采样所有“ N”个元素,因此可以使用:
ss <- sample.int(N, (l*k*k))