I am trying to run a bias-corrected percentile bootstrap for 3 different effect sizes and came across some code from a predecessor. The code works, but it takes far too long to run: roughly 3 days for each of the 3 effect sizes.
It also prints every one of the 1000 iterations, which I don't need. Ideally I would like to optimize the runtime as much as possible. I would also like all 3 outputs on the console, or in a Word/Excel document, without having to run them one at a time.
I know the code is a bit clunky, but any help would be appreciated.
I am fairly new to R and have been looking for advice on how to do this properly. The code is below:
library(pscl)
library(boot)
# RNG MODULE FOR TWO_PART HURDLE MODEL
gen.hurdle = function(n, a, b1, b2, c1, c2, i0, i1, i2){
  x = round(rnorm(n), 3)
  e = rnorm(n)
  m = round(i0 + a*x + e, 3)
  lambda = exp(i1 + b1*m + c1*x)                        # PUT REGRESSION TERMS FOR THE CONTINUUM PART HERE; KEEP exp()
  ystar = qpois(runif(n, dpois(0, lambda), 1), lambda)  # ZERO-TRUNCATED POISSON DIST.; THE CONTINUUM PART
  z = i2 + b2*m + c2*x                                  # PUT REGRESSION TERMS FOR THE BINARY PART HERE
  z_p = exp(z) / (1 + exp(z))                           # p(1) = 1 - p(0)
  tstar = rbinom(n, 1, z_p)                             # BINOMIAL DIST.; THE BINARY PART
  y = ystar*tstar                                       # TWO-PART COUNT OUTCOME
  return(cbind(x, m, y, z, z_p, tstar))
}
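# Illustrative only (not part of the original script): one draw with the
# "small effect, n = 50" settings from the scenario list below, to show the
# shape of the data this generator returns.
# head(data.frame(gen.hurdle(n = 50, a = .18, b1 = .16, b2 = .16,
#                            c1 = .25, c2 = .25, i0 = 1, i1 = 1, i2 = 1)))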
##################################################################################################
# MODEL ##########################################################################################
##################################################################################################
# i=1 ###################################
#small effect
seed=51; n=50 ;a=.18; b=.16; c=.25; i=1
seed=53; n=100 ;a=.18; b=.16; c=.25; i=1
seed=55; n=200 ;a=.18; b=.16; c=.25; i=1
seed=57; n=300 ;a=.18; b=.16; c=.25; i=1
seed=58; n=500 ;a=.18; b=.16; c=.25; i=1
seed=59; n=1000;a=.18; b=.16; c=.25; i=1
#medium effect
seed=61; n=50 ;a=.31; b=.35; c=.25; i=1
seed=63; n=100 ;a=.31; b=.35; c=.25; i=1
seed=65; n=200 ;a=.31; b=.35; c=.25; i=1
seed=67; n=300 ;a=.31; b=.35; c=.25; i=1
seed=68; n=500 ;a=.31; b=.35; c=.25; i=1
seed=69; n=1000;a=.31; b=.35; c=.25; i=1
#large effect
seed=81; n=50 ;a=.52; b=.49; c=.25; i=1
seed=73; n=100 ;a=.52; b=.49; c=.25; i=1
seed=75; n=200 ;a=.52; b=.49; c=.25; i=1
seed=77; n=300 ;a=.52; b=.49; c=.25; i=1
seed=78; n=500 ;a=.52; b=.49; c=.25; i=1
seed=79; n=1000;a=.52; b=.49; c=.25; i=1
#model
set.seed(seed)
iterations = 1000
r = 1000
results = matrix(, iterations, 4)
for (iiii in 1:iterations){
  data  = data.frame(gen.hurdle(n, a, b, b, c, c, i, i, i))
  data0 = data.frame(gen.hurdle(n, a, 0, 0, c, c, i, i, i))
  p_0        = 1 - mean(data$z_p)
  p_0_hat    = 1 - mean(data$tstar)
  p_0_b0     = 1 - mean(data0$z_p)
  p_0_hat_b0 = 1 - mean(data0$tstar)
  # power
  boot = matrix(, r, 8)
  for(jjjj in 1:r){
    # power
    fit1 = lm(m ~ x, data=data)
    fit2 = hurdle(formula = y ~ m + x, data=data, dist = "poisson", zero.dist = "binomial")
    a_hat  = summary(fit1)$coef[2,1]
    b1_hat = summary(fit2)[[1]]$count[2,1]
    b2_hat = summary(fit2)[[1]]$zero[2,1]
    ab1_hat = prod(a_hat, b1_hat)
    ab2_hat = prod(a_hat, b2_hat)
    boot.data = data[sample(nrow(data), replace = TRUE), ]
    boot.data$y[1] = if(prod(boot.data$y) > 0) 0 else boot.data$y[1]  # keep at least one zero so hurdle() can fit
    boot.fit1 = lm(m ~ x, data=boot.data)
    boot.fit2 = hurdle(formula = y ~ m + x, data=boot.data, dist = "poisson", zero.dist = "binomial")
    boot.a  = summary(boot.fit1)$coef[2,1]
    boot.b1 = summary(boot.fit2)[[1]]$count[2,1]
    boot.b2 = summary(boot.fit2)[[1]]$zero[2,1]
    boot.ab1 = prod(boot.a, boot.b1)
    boot.ab2 = prod(boot.a, boot.b2)
    # type I error
    fit3 = lm(m ~ x, data=data0)
    fit4 = hurdle(formula = y ~ m + x, data=data0, dist = "poisson", zero.dist = "binomial")
    a_hat_b0  = summary(fit3)$coef[2,1]
    b1_hat_b0 = summary(fit4)[[1]]$count[2,1]
    b2_hat_b0 = summary(fit4)[[1]]$zero[2,1]
    ab1_hat_b0 = prod(a_hat_b0, b1_hat_b0)
    ab2_hat_b0 = prod(a_hat_b0, b2_hat_b0)
    boot.data0 = data0[sample(nrow(data0), replace = TRUE), ]
    boot.data0$y[1] = if(prod(boot.data0$y) > 0) 0 else boot.data0$y[1]  # same fix for the null data
    boot.fit3 = lm(m ~ x, data=boot.data0)
    boot.fit4 = hurdle(formula = y ~ m + x, data=boot.data0, dist = "poisson", zero.dist = "binomial")
    boot.a_b0  = summary(boot.fit3)$coef[2,1]
    boot.b1_b0 = summary(boot.fit4)[[1]]$count[2,1]
    boot.b2_b0 = summary(boot.fit4)[[1]]$zero[2,1]
    boot.ab1_b0 = prod(boot.a_b0, boot.b1_b0)
    boot.ab2_b0 = prod(boot.a_b0, boot.b2_b0)
    boot[jjjj,] = c(ab1_hat, ab2_hat, boot.ab1, boot.ab2,
                    ab1_hat_b0, ab2_hat_b0, boot.ab1_b0, boot.ab2_b0)
  }
  z0.1    = qnorm((sum(boot[,3] > boot[,1]) + sum(boot[,3]==boot[,1])/2)/r)
  z0.2    = qnorm((sum(boot[,4] > boot[,2]) + sum(boot[,4]==boot[,2])/2)/r)
  z0.1_b0 = qnorm((sum(boot[,7] > boot[,5]) + sum(boot[,7]==boot[,5])/2)/r)
  z0.2_b0 = qnorm((sum(boot[,8] > boot[,6]) + sum(boot[,8]==boot[,6])/2)/r)
  alpha = 0.05                       # 95% limits
  z = qnorm(c(alpha/2, 1-alpha/2))   # Std. norm. limits
  p1 = pnorm(z - 2*z0.1)             # bias-correct & convert to proportions
  p2 = pnorm(z - 2*z0.2)
  p1_b0 = pnorm(z - 2*z0.1_b0)
  p2_b0 = pnorm(z - 2*z0.2_b0)
  ci1 = quantile(boot[,3], p=p1)     # Bias-corrected percentile lims
  ci2 = quantile(boot[,4], p=p2)
  ci1_b0 = quantile(boot[,7], p=p1_b0)
  ci2_b0 = quantile(boot[,8], p=p2_b0)
  sig.ab1    = if(prod(ci1) > 0) 1 else 0
  sig.ab2    = if(prod(ci2) > 0) 1 else 0
  sig.ab1_b0 = if(prod(ci1_b0) > 0) 1 else 0
  sig.ab2_b0 = if(prod(ci2_b0) > 0) 1 else 0
  # results
  results[iiii,] = c(sig.ab1, sig.ab2, sig.ab1_b0, sig.ab2_b0)
  message(paste0(iiii, " / ", iterations))  # progress: current iteration out of the total
  flush.console()
}
i
n
a
b
iterations
# number of bootstrap resamples
r
#power of ab1
mean(results[,1])
#power of ab2
mean(results[,2])
#type I error of ab1
mean(results[,3])
#type I error of ab2
mean(results[,4])
It seems to me that the problem with running each effect size separately comes from each run overwriting the same results object. I don't know the best way to print all of the results (one set per effect size) without interfering with the loops.
The length apparently comes from the number of iterations, plus a lack of RAM to get through them quickly. Is this even fixable? The runtime would be far less of a hassle if a single run gave me results for all of the effect sizes.
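For context, the kind of restructuring I have in mind looks roughly like the sketch below: the three effect-size settings sit in one table, a loop runs them all, and everything is written to one file at the end. This is only a hypothetical outline; simulate_power() is a made-up name standing in for the model code above, not something I have written.

# Rough sketch only: run all three effect-size scenarios in one pass.
scenarios <- data.frame(
  effect = c("small", "medium", "large"),
  seed   = c(51, 61, 81),
  a      = c(.18, .31, .52),
  b      = c(.16, .35, .49),
  c      = .25,
  i      = 1
)

power.list <- lapply(seq_len(nrow(scenarios)), function(s) {
  p <- scenarios[s, ]
  set.seed(p$seed)
  # simulate_power() is hypothetical: it would wrap the iteration/bootstrap
  # loops above and return the four rejection rates as a named vector
  simulate_power(n = 50, a = p$a, b = p$b, c = p$c, i = p$i)
})

out <- cbind(scenarios, do.call(rbind, power.list))
print(out)                                                  # all results on the console
write.csv(out, "bootstrap_results.csv", row.names = FALSE)  # openable in Excel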
Answer (score: 1)
To give you a partial answer, here is how to bootstrap with a function.
Consider this data,
summary(beaver2[c("time", "activ")]); nrow(beaver2)
#       time          activ
#  Min.   :   0   Min.   :0.00
#  1st Qu.:1128   1st Qu.:0.00
#  Median :1535   Median :1.00
#  Mean   :1446   Mean   :0.62
#  3rd Qu.:1942   3rd Qu.:1.00
#  Max.   :2350   Max.   :1.00
# [1] 100
and this regression:
summary(with(beaver2, lm(temp ~ activ)))$coe
#               Estimate Std. Error    t value      Pr(>|t|)
# (Intercept) 37.0968421 0.03456240 1073.32955 2.895092e-201
# activ        0.8062224 0.04389429   18.36736  1.682051e-33
Using a for loop, we would bootstrap like this:
set.seed(528)
for.res <- rep(NA, 5e2)
for (i in 1:5e2) {
  X <- beaver2[sample(seq(nrow(beaver2)), replace=TRUE), ]
  y0 <- with(X, mean(temp[activ == 0]))
  y1 <- with(X, mean(temp[activ == 1]))
  for.res[i] <- y1 - y0
}
cat("beta:", b <- mean(for.res), "\tse:", s <- sd(for.res), "\tt:", t <- b/s,
"\tp:", 2 * pt(-abs(t), df = nrow(beaver2) - 1))
# beta: 0.8041428 se: 0.04089321 t: 19.66446 p: 5.761707e-36
Using a function, we would bootstrap like this:
bootFun <- function(X) {
  y0 <- with(X, mean(temp[activ == 0]))
  y1 <- with(X, mean(temp[activ == 1]))
  return(y1 - y0)
}
set.seed(528)
boot.res <- replicate(5e2, bootFun(X=beaver2[sample(seq(nrow(beaver2)),
replace=TRUE), ]))
cat("beta:", b <- mean(boot.res), "\tse:", s <- sd(boot.res), "\tt:", t <- b/s,
"\tp:", 2 * pt(-abs(t), df = nrow(beaver2) - 1))
# beta: 0.8049935 se: 0.04255446 t: 18.91678 p: 1.203037e-34
In the example above, replicate is roughly 10% faster than the for loop:
microbenchmark::microbenchmark(
  replicate = replicate(5e2, bootFun(beaver2[sample(seq(nrow(beaver2)),
                                                    replace=TRUE), ])),
  for.loop = {
    for (i in 1:5e2) {
      X <- beaver2[sample(seq(nrow(beaver2)), replace=TRUE), ]
      y0 <- with(X, mean(temp[activ == 0]))
      y1 <- with(X, mean(temp[activ == 1]))
      for.res[i] <- y1 - y0
    }
  })
# Unit: milliseconds
#      expr      min       lq      mean   median       uq      max neval cld
# replicate 88.37455 89.12988  93.36896 89.66325  99.8587 133.1689   100 a
#  for.loop 95.88837 96.65481 102.40125 97.07489 107.5985 295.3379   100  b
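Applied to your hurdle model, the same idea is to wrap one bootstrap resample in a function and call it with replicate() (or pass it to boot::boot(), which you already load but never use). Below is a rough, untested sketch that assumes data and r exist as in your script; it only computes the two indirect-effect estimates and leaves your bias-corrected interval step unchanged. Note also that fit1/fit2 (and fit3/fit4) in your inner loop are refit on the original, unresampled data in every one of the r passes; fitting them once per iteration, outside the jjjj loop, should by itself remove a large share of the runtime.

# Sketch: one bootstrap resample of the indirect effects a*b1 and a*b2
bootAB <- function(dat) {
  bd <- dat[sample(nrow(dat), replace = TRUE), ]
  bd$y[1] <- if (prod(bd$y) > 0) 0 else bd$y[1]  # keep at least one zero for hurdle()
  a  <- coef(lm(m ~ x, data = bd))["x"]
  f2 <- hurdle(y ~ m + x, data = bd, dist = "poisson", zero.dist = "binomial")
  c(ab1 = unname(a * coef(f2, model = "count")["m"]),
    ab2 = unname(a * coef(f2, model = "zero")["m"]))
}

boot.ab <- t(replicate(r, bootAB(data)))  # r x 2 matrix of bootstrap indirect effects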