我正在努力让我的代码更快。目前我正在使用doParallel,但我想知道它是否可以通过巧妙的编程而不是更快的硬件更快。这是我想要做的程式化版本:
library(dplyr)
library(doParallel)
library(data.table)
cl <- makeCluster(detectCores(all.tests=FALSE,logical=TRUE))
registerDoParallel(cl)
set.seed(12345)
crit <- 0.5
dta <- data.frame(treat = sample(1:12,1000, replace=TRUE),
dep = sample(100:200,1000, replace=TRUE),
uniqID = rep(1:100,length.out = 1000))
nr_repl <- 1000
oper <- foreach (repl = 1:nr_repl, .combine=cbind,.packages = c("data.table")) %dopar% {
dta_sim <- data.table(dta)
setDT(dta_sim)[,perm:=sample(treat),by = (uniqID)]
dta_sim$recipient <- "single"
dta_sim$recipient[dta_sim$perm == 5 |dta_sim$perm == 6 |dta_sim$perm == 7 |dta_sim$perm == 12 ] <- "couple"
return(abs(summary(lm(dep~recipient=='couple', data=dta_sim))$coefficients[2,1]) > abs(crit) )
}
mean(oper)
在有用的评论和建议之后,我最终得到了这些:
library(dplyr)
library(doParallel)
library(data.table)
cl <- makeCluster(detectCores(all.tests=FALSE,logical=TRUE))
registerDoParallel(cl)
set.seed(12345)
crit <- .5
dta <- data.frame(treat = sample(1:12,1000, replace=TRUE),
dep = sample(100:200,1000, replace=TRUE),
uniqID = rep(1:100,length.out = 1000))
nr_repl <- 1000
oper <- foreach (repl = 1:nr_repl, .combine=cbind,.packages = c("data.table")) %dopar% {
dta_sim <- data.table(dta)
setDT(dta_sim)[,perm:=sample(treat),by = (uniqID)]
dta_sim$recipient <- ifelse(dta_sim$perm %in% c(5,6,7,12), "couple", "single")
return(abs(coef(lm(dep~recipient=='couple', data=dta_sim))[2]) > abs(crit) )
}
mean(oper)
答案 0 :(得分:0)
要优化的一件事是将names(table(dta$uniqID))
调用中的sapply
替换为as.character(unique(dta$uniqID))
。根据您的数据大小,这将快几倍。
使用长度为1,000,000的向量进行基准测试:
x <- rep(1:100, 10000)
> test <- microbenchmark(names(table(x)), as.character(unique(x)), times = 100, unit = "s")
> test
Unit: seconds
expr min lq mean median uq max neval cld
names(table(x)) 0.22017963 0.222114845 0.27556537 0.226181405 0.328574578 0.4403513 100 b
as.character(unique(x)) 0.00714553 0.007643307 0.02098216 0.008296107 0.009429619 0.1173899 100 a
比较中值,as.character(unique(x))
执行速度比names(table(x))
快28倍。你的其余代码对我来说似乎非常精简。