我尝试重现Stata在运行带有聚类标准误(clustered standard errors)的模型时给出的95%置信区间。例如:
regress api00 acs_k3 acs_46 full enroll, cluster(dnum)
Regression with robust standard errors Number of obs = 395
F( 4, 36) = 31.18
Prob > F = 0.0000
R-squared = 0.3849
Number of clusters (dnum) = 37 Root MSE = 112.20
------------------------------------------------------------------------------
| Robust
api00 | Coef. Std. Err. t P>|t| [95% Conf. Interval]
---------+--------------------------------------------------------------------
acs_k3 | 6.954381 6.901117 1.008 0.320 -7.041734 20.9505
acs_46 | 5.966015 2.531075 2.357 0.024 .8327565 11.09927
full | 4.668221 .7034641 6.636 0.000 3.24153 6.094913
enroll | -.1059909 .0429478 -2.468 0.018 -.1930931 -.0188888
_cons | -5.200407 121.7856 -0.043 0.966 -252.193 241.7922
------------------------------------------------------------------------------
我能够重现系数和标准误差:
library(readstata13)
library(texreg)
library(sandwich)
library(lmtest)
#' Cluster-robust covariance with Stata-style small-sample inference
#'
#' Builds the clustered sandwich covariance matrix scaled by
#' (M / (M - 1)) * ((N - 1) / (N - K)), where M is the number of clusters,
#' N the number of observations used in the fit and K the model rank.
#' Tests and confidence intervals use a t distribution with M - 1 degrees
#' of freedom, which is the reference distribution Stata reports for
#' `cluster()` (t(36) for 37 clusters in the example above).
#'
#' @param model_result Fitted model exposing `$model` and `$rank` (e.g. the
#'   result of `lm()`) and supported by `estfun()`, `sandwich()` and
#'   `coeftest()`.
#' @param data Data frame the model was fitted on. Its row names must match
#'   the row names of the model frame.
#' @param cluster Name (single string) of the column in `data` holding the
#'   cluster identifier.
#' @param level Confidence level used for `conf.int`.
#' @return A list with `vcovCL` (clustered covariance matrix),
#'   `standard.errors`, `p.values` (t with M - 1 df), `df` (M - 1) and
#'   `conf.int` (matrix with columns `lower` and `upper`).
clustered.se <- function(model_result, data, cluster, level = 0.95) {
  stopifnot(
    is.character(cluster),
    length(cluster) == 1L,
    cluster %in% colnames(data),
    is.numeric(level),
    length(level) == 1L,
    level > 0,
    level < 1
  )

  # Keep only the observations that entered the fit. drop = FALSE keeps a
  # data frame even though a single column is selected, so the extraction
  # below cannot collapse into a bare vector.
  model_rows <- rownames(model_result$model)
  cl <- data[model_rows, cluster, drop = FALSE][[1L]]

  M <- length(unique(cl))
  N <- length(cl)
  K <- model_result$rank
  if (M < 2L) {
    stop("at least two clusters are required", call. = FALSE)
  }
  dfc <- (M / (M - 1)) * ((N - 1) / (N - K))

  # Sum the score contributions within each cluster. rowsum() groups by the
  # values actually present, so unused factor levels cannot introduce NA rows.
  uj <- rowsum(estfun(model_result), group = cl)
  vcov_cl <- dfc * sandwich(model_result, meat = crossprod(uj) / N)

  # One coeftest() call feeds every reported quantity, so the standard
  # errors, p-values and intervals always share the same df.
  df_cl <- M - 1
  tests <- coeftest(model_result, vcov. = vcov_cl, df = df_cl)
  estimates <- tests[, 1]
  standard.errors <- tests[, 2]
  p.values <- tests[, 4]

  half_width <- qt(1 - (1 - level) / 2, df = df_cl) * standard.errors
  conf.int <- cbind(
    lower = estimates - half_width,
    upper = estimates + half_width
  )

  list(
    vcovCL = vcov_cl,
    standard.errors = standard.errors,
    p.values = p.values,
    df = df_cl,
    conf.int = conf.int
  )
}
# Load the Stata data set and fit the same specification as the Stata
# `regress` call shown above.
elemapi2 <- read.dta13(file = "elemapi2.dta")
lm1 <- lm(
  formula = api00 ~ acs_k3 + acs_46 + full + enroll,
  data = elemapi2
)

# Standard errors and p-values clustered on the district number.
clustered_se <- clustered.se(
  model_result = lm1,
  data = elemapi2,
  cluster = "dnum"
)

# The list element is `p.values`; it is spelled out in full here because
# `$p.value` only resolved through partial matching of `$`.
htmlreg(
  lm1,
  override.se = clustered_se$standard.errors,
  override.p = clustered_se$p.values,
  star.symbol = "\\*",
  digits = 7
)
=============================
Model 1
-----------------------------
(Intercept) -5.2004067
(121.7855938)
acs_k3 6.9543811
(6.9011174)
acs_46 5.9660147 *
(2.5310751)
full 4.6682211 ***
(0.7034641)
enroll -0.1059909 *
(0.0429478)
-----------------------------
R^2 0.3848830
Adj. R^2 0.3785741
Num. obs. 395
RMSE 112.1983218
=============================
*** p < 0.001, ** p < 0.01, * p < 0.05
唉,我无法重现95%的置信区间:
# ci.force = TRUE makes texreg turn the supplied standard errors into
# confidence intervals instead of printing them in parentheses.
# `p.values` is written out in full: `$p.value` relied on partial matching.
screenreg(
  lm1,
  override.se = clustered_se$standard.errors,
  override.p = clustered_se$p.values,
  digits = 7,
  ci.force = TRUE
)
========================================
Model 1
----------------------------------------
(Intercept) -5.2004067
[-243.8957845; 233.4949710]
acs_k3 6.9543811
[ -6.5715605; 20.4803228]
acs_46 5.9660147 *
[ 1.0051987; 10.9268307]
full 4.6682211 *
[ 3.2894567; 6.0469855]
enroll -0.1059909 *
[ -0.1901670; -0.0218148]
----------------------------------------
R^2 0.3848830
Adj. R^2 0.3785741
Num. obs. 395
RMSE 112.1983218
========================================
* 0 outside the confidence interval
如果我手工计算置信区间,得到的结果与texreg相同:
# Hand-built two-sided interval around each coefficient, using the
# standard normal quantile as the critical value.
level <- 0.95
a <- 1 - (1 - level) / 2
coeff <- lm1$coefficients
se <- clustered_se$standard.errors
z_margin <- qnorm(a) * se
lb <- coeff - z_margin
ub <- coeff + z_margin
> lb
(Intercept) acs_k3 acs_46 full enroll
-243.895784 -6.571560 1.005199 3.289457 -0.190167
> ub
(Intercept) acs_k3 acs_46 full enroll
233.49497100 20.48032276 10.92683074 6.04698550 -0.02181481
Stata在做什么以及如何在R中重现它?
PS:这是follow up question。 PS2:Stata数据可用here。
答案 0 :(得分:1)
看起来Stata使用的是基于t(36)分布、而不是基于Z(即正态分布)的置信区间。
从Stata输出中获取值
# Values copied from the acs_k3 row of the Stata output. The estimate is
# stored as `est` so that it does not mask the coef() function.
est <- 6.954381
rse <- 6.901117
lwr <- -7.041734
upr <- 20.9505
# Distance from the estimate to each bound, in standard-error units:
# this is the critical value Stata must have used.
(upr - est) / rse
## [1] 2.028095
(lwr - est) / rse
## [1] -2.028094
计算/交叉检查t(36)的尾值:
# Cumulative probability of the implied critical value under t(36) ...
pt(q = 2.028094, df = 36)
## [1] 0.975
# ... and the matching 97.5% quantile of t(36).
qt(p = 0.975, df = 36)
## [1] 2.028094
我不知道如何将置信区间传递给texreg。由于你没有给出可重现的例子(我没有elemapi2.dta),我无法确切地说出应如何获得自由度df,但看起来你需要的是(聚类数减1):
tdf <- length(unique(elemapi2$dnum))-1
# Two-sided interval based on the t distribution with `tdf` degrees of
# freedom instead of the normal distribution.
level <- 0.95
a <- 1 - (1 - level) / 2
# `clustered_se` is the list returned by clustered.se(); arithmetic needs
# its numeric vector of standard errors, not the list itself.
se <- clustered_se$standard.errors
half_width <- se * qt(a, tdf)
# One row per coefficient with both bounds. Multiplying by c(-1, 1) would
# recycle the signs across coefficients rather than across bounds.
bounds <- cbind(
  lower = coef(lm1) - half_width,
  upper = coef(lm1) + half_width
)
答案 1 :(得分:0)
事实上,Stata使用的是t分布而不是正态分布。现在有一个非常简单的解决方案:使用estimatr包中的lm_robust,即可在texreg中得到与Stata一致的置信区间。该包可以从CRAN安装:install.packages("estimatr")。
> library(estimatr)
> lmro <- lm_robust(mpg ~ hp, data = mtcars, clusters = cyl, se_type = "stata")
> screenreg(lmro)
===========================
Model 1
---------------------------
(Intercept) 30.10 *
[13.48; 46.72]
hp -0.07
[-0.15; 0.01]
---------------------------
R^2 0.60
Adj. R^2 0.59
Num. obs. 32
RMSE 3.86
===========================
* 0 outside the confidence interval