我正在尝试有效地实现块引导技术来获得回归系数的分布。主要内容如下。
我有一个面板数据集,并说公司和年份是指数。对于bootstrap的每次迭代,我希望对n个主题进行替换。从这个示例中,我需要构建一个新的数据框,它是每个采样主题的所有观察值的rbind()
堆栈,运行回归并拉出系数。重复一堆迭代,比如说100。
我的想法最初是使用split()
命令按主题将现有数据框拆分为列表。从那里,使用
sample(unique(df1$subject),n,replace=TRUE)
获取新列表,然后从quickdf
包中实现plyr
以构建新数据框。
示例慢代码:
require(plm)
data("Grunfeld", package="plm")
firms = unique(Grunfeld$firm)
n = 10
iterations = 100
mybootresults=list()
for(j in 1:iterations){
v = sample(length(firms),n,replace=TRUE)
newdata = NULL
for(i in 1:n){
newdata = rbind(newdata,subset(Grunfeld, firm == v[i]))
}
reg1 = lm(value ~ inv + capital, data = newdata)
mybootresults[[j]] = coefficients(reg1)
}
mybootresults = as.data.frame(t(matrix(unlist(mybootresults),ncol=iterations)))
names(mybootresults) = names(reg1$coefficients)
mybootresults
(Intercept) inv capital
1 373.8591 6.981309 -0.9801547
2 370.6743 6.633642 -1.4526338
3 528.8436 6.960226 -1.1597901
4 331.6979 6.239426 -1.0349230
5 507.7339 8.924227 -2.8661479
...
...
答案 0 :(得分:14)
这样的事情怎么样:
myfit <- function(x, i) {
mydata <- do.call("rbind", lapply(i, function(n) subset(Grunfeld, firm==x[n])))
coefficients(lm(value ~ inv + capital, data = mydata))
}
firms <- unique(Grunfeld$firm)
b0 <- boot(firms, myfit, 999)
答案 1 :(得分:4)
您还可以使用boot
包中的tsboot函数和固定块重采样方案。
require(plm)
require(boot)
data(Grunfeld)
### each firm is of length 20
table(Grunfeld$firm)
## 1 2 3 4 5 6 7 8 9 10
## 20 20 20 20 20 20 20 20 20 20
blockboot <- function(data)
{
coefficients(lm(value ~ inv + capital, data = data))
}
### fixed length (every 20 obs, so for each different firm) block bootstrap
set.seed(321)
boot.1 <- tsboot(Grunfeld, blockboot, R = 99, l = 20, sim = "fixed")
boot.1
## Bootstrap Statistics :
## original bias std. error
## t1* 410.81557 -25.785972 174.3766
## t2* 5.75981 0.451810 2.0261
## t3* -0.61527 0.065322 0.6330
dim(boot.1$t)
## [1] 99 3
head(boot.1$t)
## [,1] [,2] [,3]
## [1,] 522.11 7.2342 -1.453204
## [2,] 626.88 4.6283 0.031324
## [3,] 479.74 3.2531 0.637298
## [4,] 557.79 4.5284 0.161462
## [5,] 568.72 5.4613 -0.875126
## [6,] 379.04 7.0707 -1.092860
答案 2 :(得分:1)
我发现使用library(boot) # for boot
library(plm) # for Grunfeld
library(dplyr) # for left_join
# First get the data
data("Grunfeld", package="plm")
myfit1 <- function(x, i) {
# x is the vector of firms
# i are the indexes into x
mydata <- do.call("rbind", lapply(i, function(n) subset(Grunfeld, firm==x[n])))
coefficients(lm(value ~ inv + capital, data = mydata))
}
myfit2 <- function(x, i) {
# x is the vector of firms
# i are the indexes into x
mydata <- left_join(data.frame(firm=x[i]), Grunfeld, by="firm")
coefficients(lm(value ~ inv + capital, data = mydata))
}
# rbind method
set.seed(1)
system.time(b1 <- boot(firms, myfit1, 5000))
## user system elapsed
## 13.51 0.01 13.62
# left_join method
set.seed(1)
system.time(b2 <- boot(firms, myfit2, 5000))
## user system elapsed
## 8.16 0.02 8.26
summary(b1)
## R original bootBias bootSE bootMed
## 1 5000 410.81557 14.78272 195.62461 413.70175
## 2 5000 5.75981 0.49301 2.42879 6.00692
## 3 5000 -0.61527 -0.13134 0.78854 -0.76452
summary(b2)
## R original bootBias bootSE bootMed
## 1 5000 410.81557 14.78272 195.62461 413.70175
## 2 5000 5.75981 0.49301 2.42879 6.00692
## 3 5000 -0.61527 -0.13134 0.78854 -0.76452
的方法更简洁,只需要大约60%的时间,并给出与Sean的答案相同的结果。这是一个完整的自包含示例。
{{1}}
答案 3 :(得分:1)
需要修改解决方案以管理固定效果。
library(boot) # for boot
library(plm) # for Grunfeld
library(dplyr) # for left_join
## Get the Grunfeld firm data (10 firms, each for 20 years, 1935-1954)
data("Grunfeld", package="plm")
## Create dataframe with unique firm identifier (one line per firm)
firms <- data.frame(firm=unique(Grunfeld$firm),junk=1)
## for boot(), X is the firms dataframe; i index the sampled firms
myfit <- function(X, i) {
## join the sampled firms to their firm-year data
mydata <- left_join(X[i,], Grunfeld, by="firm")
## Distinguish between multiple resamples of the same firm
## Otherwise they have the same id in the fixed effects regression
## And trouble ensues
mydata <- mutate(group_by(mydata,firm,year),
firm_uniq4boot = paste(firm,"+",row_number())
)
## Run regression with and without firm fixed effects
c(coefficients(lm(value ~ inv + capital, data = mydata)),
coefficients(lm(value ~ inv + capital + factor(firm_uniq4boot), data = mydata)))
}
set.seed(1)
system.time(b <- boot(firms, myfit, 1000))
summary(b)
summary(lm(value ~ inv + capital, data=Grunfeld))
summary(lm(value ~ inv + capital + factor(firm), data=Grunfeld))
答案 4 :(得分:0)
这是一种方法,通常应该比接受的答案更快,返回相同的结果,不依赖于其他包(boot
除外)。这里的关键是使用which
和整数索引来构建每个data.frame replicate而不是split/subset
和do.call/rbind
。
# get function for boot
myIndex <- function(x, i) {
# select the observations to subset. Likely repeated observations
blockObs <- unlist(lapply(i, function(n) which(x[n] == Grunfeld$firm)))
# run regression for given replicate, return estimated coefficients
coefficients(lm(value~ inv + capital, data=Grunfeld[blockObs,]))
}
现在,bootstrap
# get result
library(boot)
set.seed(1234)
b1 <- boot(firms, myIndex, 200)
运行已接受的答案
set.seed(1234)
b0 <- boot(firms, myfit, 200)
让我们进行比较
使用索引
b1
ORDINARY NONPARAMETRIC BOOTSTRAP
Call:
boot(data = firms, statistic = myIndex, R = 200)
Bootstrap Statistics :
original bias std. error
t1* 410.8155650 -6.64885086 197.3147581
t2* 5.7598070 0.37922066 2.4966872
t3* -0.6152727 -0.04468225 0.8351341
原始版本
b0
ORDINARY NONPARAMETRIC BOOTSTRAP
Call:
boot(data = firms, statistic = myfit, R = 200)
Bootstrap Statistics :
original bias std. error
t1* 410.8155650 -6.64885086 197.3147581
t2* 5.7598070 0.37922066 2.4966872
t3* -0.6152727 -0.04468225 0.8351341
这些看起来非常接近。现在,再多检查一下
identical(b0$t, b1$t)
[1] TRUE
和
identical(summary(b0), summary(b1))
[1] TRUE
最后,我们将做一个快速的基准测试
library(microbenchmark)
microbenchmark(index={b1 <- boot(firms, myIndex, 200)},
rbind={b0 <- boot(firms, myfit, 200)})
在我的电脑上,返回
Unit: milliseconds
expr min lq mean median uq max neval
index 292.5770 296.3426 303.5444 298.4836 301.1119 395.1866 100
rbind 712.1616 720.0428 729.6644 724.0777 731.0697 833.5759 100
因此,直接索引在分发的每个级别上都快2倍以上。
关于缺少固定效果的说明
与大多数答案一样,可能会出现缺少“固定效应”的问题。通常,固定效应用作对照,并且研究人员对每个选定观察中将包括的一个或几个变量感兴趣。在这种占优势的情况下,限制myIndex
或myfit
函数的返回结果只包含返回向量中感兴趣的变量没有(或很少)损害。