我正在尝试模拟用于测试多级贝叶斯逻辑回归模型的多级数据(随机拦截)。这是我到目前为止所做的:
n.country <- 42 #number of units
obs <- rbinom(n.country, 10, .8) #number of obs. per unit
n.obs <- sum(obs) #overall number of obs
country <- rep(1:n.country, obs)
e.country <- rnorm(n.country,0, 3) #country-specific error
e.obs <- rnorm(n.obs, 0, 5) #observation-specific error
# Continuous independent variables
set.seed(666)
X1 = rnorm(n.obs)
X2 = rnorm(n.obs)
X3 = rnorm(n.obs)
X4 = rnorm(n.obs)
X5 = rnorm(n.obs)
X6 = rnorm(n.obs)
X7 = rnorm(n.obs)
X8 = rnorm(n.obs)
#Dependent variable
L <- 2*X1 + 3*X2 + 1.75*X3 + 2.5*X4 + 1.5*X5 +
3.5*X6 + 0.5*X7 + 1.75*X8 + e.country[country] + e.obs
## prob. of the dependent variable described by a linear
## combination of covariates and country-specific intercepts
pr = 1/(1+exp(-L))# pass through an inv-logit function
Y = rbinom(n.obs,1,pr) #DV described by its probability
#Set up data frame
Test = data.frame(y=Y,x1=X1,x2=X2,x3=X3,x4=X4,x5=X5,x6=X6,
x7=X7,x8=X8,country=country)
但是,在将估算器应用于它时,它产生的系数与我设置的系数不同,glmer()
包的lme4
函数也是如此。
我对随机和固定效果都很感兴趣。我的贝叶斯估计器看起来像这样:
writeLines("
model{
for (i in 1:N){
y[i] ~ dbin(p.bound[i], 1)
p.bound[i] <- max(0, min(1, p[i]))
logit(p[i]) <- Xbeta[i] #likelihood
Xbeta[i] <- v[country[i]] + beta[1]*x1[i] + beta[2]*x2[i] + beta[3]*x3[i] + beta[4]*x4[i] + beta[5]*x5[i] + beta[6]*x6[i] + beta[7]*x7[i] + beta[8]*x8[i]
} #linear combination
for (j in 1:n){
v[j] ~ dnorm(beta0, prec.tau2) #country-specific intercept
}
beta[1] ~ dmnorm(0,0.001) #IV priors
beta[2] ~ dmnorm(0,0.001)
beta[3] ~ dmnorm(0,0.001)
beta[4] ~ dmnorm(0,0.001)
beta[5] ~ dmnorm(0,0.001)
beta[6] ~ dmnorm(0,0.001)
beta[7] ~ dmnorm(0,0.001)
beta[8] ~ dmnorm(0,0.001)
beta0 ~ dnorm(0,0.001)
prec.tau2 ~dgamma(0.001,0.001) #hyperparameter priors
tau <- sqrt(1/prec.tau2)
}", con="Multi.logit.jag")
y <- Vars$fix
x1 <- Vars$h_polcon3
x2 <- Vars$imf_infl
x3 <- Vars$imf_ue
x4 <- Vars$imf_gd
x5 <- Vars$wdi_trade
x6 <- Vars$dr_eg
x7 <- Vars$wdi_gdppcgr
x8 <- Vars$dpi_erlc
N <- 607
country <- Vars$cname #specify levels
country <- as.factor(country)
nlevels(country) #Sample consists of 42 countries
country <- as.numeric(country) #transfer factor variabel to numeric variable in order to make model run
n <- 42
M.data.multi <-list('N'=N, 'n'= n, 'y'=y, 'country' = country, 'x1'=x1, 'x2'=x2, 'x3'=x3, 'x4'=x4, 'x5'=x5, 'x6'=x6, 'x7'=x7, 'x8'=x8)
library(coda)
library(rjags)
library(R2jags)
M.sim.multi <- jags(data=M.data.multi, parameters.to.save="beta", model.file="Multi.logit.jag", n.chains=3, n.iter=2000, n.burnin=500)
print(M.sim.multi)
和收益
Inference for Bugs model at "Multi.logit.jag", fit using jags,
3 chains, each with 2000 iterations (first 500 discarded)
n.sims = 4500 iterations saved
mu.vect sd.vect 2.5% 25% 50% 75% 97.5% Rhat n.eff
beta[1] 1.049 2.202 -3.081 -0.484 1.097 2.520 5.844 1.782 6
beta[2] 0.004 0.011 -0.017 -0.001 0.005 0.011 0.026 1.437 8
beta[3] 0.165 0.065 0.034 0.118 0.171 0.213 0.286 1.039 85
beta[4] -0.035 0.011 -0.061 -0.042 -0.033 -0.027 -0.016 1.191 15
beta[5] 0.060 0.015 0.030 0.048 0.060 0.070 0.086 1.411 9
beta[6] 0.071 0.027 0.012 0.048 0.075 0.094 0.110 1.187 16
beta[7] -0.140 0.053 -0.243 -0.178 -0.138 -0.105 -0.039 1.072 33
beta[8] -0.037 0.186 -0.392 -0.171 -0.036 0.092 0.348 1.125 21
deviance 332.333 9.250 316.568 325.541 331.423 337.914 352.942 1.014 150
For each parameter, n.eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor (at convergence, Rhat=1).
DIC info (using the rule, pD = var(deviance)/2)
pD = 42.2 and DIC = 374.6
DIC is an estimate of expected predictive error (lower deviance is better).
glmer()模型如下:
library(Matrix)
library(lme4)
library(ggplot2)
library(lmerTest)
Multi.freq <- glmer(fix ~ h_polcon3 + imf_infl + imf_ue + imf_gd + wdi_trade + dr_eg + wdi_gdppcgr + dpi_erlc + (1 | cname), family = binomial(link="logit"), data=Vars)
Multi.freq
和收益
Generalized linear mixed model fit by maximum likelihood (Laplace Approximation) [
glmerMod]
Family: binomial ( logit )
Formula: fix ~ h_polcon3 + imf_infl + imf_ue + imf_gd + wdi_trade + dr_eg +
wdi_gdppcgr + dpi_erlc + (1 | cname)
Data: Vars
Control: glmerControl(optCtrl = list(maxfun = 20000))
AIC BIC logLik deviance df.resid
446.6 490.7 -213.3 426.6 597
Scaled residuals:
Min 1Q Median 3Q Max
-2.5707 -0.2023 0.0056 0.2251 6.7237
Random effects:
Groups Name Variance Std.Dev.
cname (Intercept) 16.92 4.113
Number of obs: 607, groups: cname, 42
Fixed effects:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -10.640232 2.362286 -4.504 6.66e-06 ***
h_polcon3 1.025333 1.839488 0.557 0.57725
imf_infl 0.004985 0.010386 0.480 0.63129
imf_ue 0.183076 0.072181 2.536 0.01120 *
imf_gd -0.031592 0.011738 -2.692 0.00711 **
wdi_trade 0.048175 0.016842 2.860 0.00423 **
dr_eg 0.092837 0.029145 3.185 0.00145 **
wdi_gdppcgr -0.118344 0.049786 -2.377 0.01745 *
dpi_erlc -0.100724 0.183591 -0.549 0.58326
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) h_plc3 imf_nf imf_ue imf_gd wd_trd dr_eg wd_gdp
h_polcon3 -0.366
imf_infl -0.077 0.104
imf_ue -0.346 -0.076 0.265
imf_gd 0.052 -0.017 -0.214 -0.507
wdi_trade -0.238 0.099 -0.193 0.193 -0.285
dr_eg -0.585 -0.068 -0.304 0.038 0.043 -0.282
wdi_gdppcgr 0.039 -0.109 0.157 0.040 0.123 -0.295 0.024
dpi_erlc -0.163 0.151 -0.216 -0.005 0.017 0.156 -0.033 -0.122
因此我怀疑在设置数据时我犯了一个错误。谁能告诉我我做错了什么?
我尝试了另一种更简单的方法(删除e.obs),但是,对于贝叶斯和包估计器,我仍然会得到不同的结果:
b1.true <- 2.5 #individual level predictor coefficient
b2.true <- 1.75
mu.a.true <- 0.5
sigma.a.true <- 1.25 #among group variance
#Simulating the varying coefficients
J <- 10 #50 groups
u <- rnorm(J,0, 3) #group level predictor variable
a.true <- rep (NA, J)
for (j in 1:J){
a.true[j] <- rnorm (1, mu.a.true, sigma.a.true)
}
n <- 50 #number of observations
#set.seed(666)
country <- rep(1:J, n)
x1 <- rnorm(n) #individual level predictor variable
x2 <- rnorm(n)
y <- rep (NA, n) #dependent variable
for (i in 1:n){
y[i] <- rbinom(n, 1, 1/(1+exp(-(a.true[country[i]] + b1.true*x1[i] + b2.true*x2[i]))))
}#prob. of the dependent variable described by a linear combination of covariates and country-specific intercepts
#for (i in 1:n){
#y[i] <- rnorm(1, a.true[country[i]] + b.true*x[i], sigma.y.true)
#}
Test <- data.frame(country, x1, x2, y)
#Apply Bayesian estimator to Test dataset
writeLines("
model{
for (i in 1:N){
y[i] ~ dbin(p.bound[i], 1)
p.bound[i] <- max(0, min(1, p[i]))
logit(p[i]) <- Xbeta[i] #likelihood
Xbeta[i] <- beta0 + beta[1]*x1[i] +beta[2]*x2[i] + v[country[i]]
} #linear combination
beta0 ~ dnorm(0,0.001)
beta[1] ~ dnorm(0,0.001) #IV priors
beta[2] ~ dnorm(0,0.001)
for (j in 1:n){
v[j] ~ dnorm(mu.country, tau.country) #country-specific intercept
}
mu.country ~ dnorm(0,0.001)
tau.country <- pow(sigma.country, -2) #hyperparameter priors
sigma.country ~ dunif(0, 100)
}", con="Multi.logit.jag")
y <- Test$y
x1 <- Test$x1
x2 <- Test$x2
N <- 500
country <- Test$country #specify levels
country <- as.factor(country)
nlevels(country) #10 groups
country <- as.numeric(country) #transfer factor variabel to numeric variable in order to make model run
n <- 10
M.data.test <-list('N'=N, 'n'= n, 'y'=y, 'country' = country, 'x1'=x1, 'x2'=x2)
library(coda)
library(rjags)
library(R2jags)
M.test <- jags(data=M.data.test, parameters.to.save="beta", model.file="Multi.logit.jag", n.chains=3, n.iter=2000, n.burnin=500)
print(M.test)
结果仍有偏见:
Inference for Bugs model at "Multi.logit.jag", fit using jags,
3 chains, each with 2000 iterations (first 500 discarded)
n.sims = 4500 iterations saved
mu.vect sd.vect 2.5% 25% 50% 75% 97.5% Rhat n.eff
beta[1] 3.901 0.524 2.938 3.530 3.859 4.256 5.008 1.008 280
beta[2] 1.575 0.245 1.121 1.402 1.570 1.736 2.078 1.002 1500
deviance 273.741 4.814 266.316 270.207 273.118 276.595 284.787 1.002 2100
For each parameter, n.eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor (at convergence, Rhat=1).
DIC info (using the rule, pD = var(deviance)/2)
pD = 11.6 and DIC = 285.3
DIC is an estimate of expected predictive error (lower deviance is better).
lme4:
相同#Test with package
library(Matrix)
library(lme4)
library(ggplot2)
library(lmerTest)
Multi.freq <- glmer(y ~ x1 + x2 + (1 | country), family = binomial(link="logit"), data=Test, glmerControl(optimizer="bobyqa", optCtrl = list(maxfun = 100000)))
summary(Multi.freq)
结果:
Generalized linear mixed model fit by maximum likelihood (Laplace Approximation) [
glmerMod]
Family: binomial ( logit )
Formula: y ~ x1 + x2 + (1 | country)
Data: Test
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 1e+05))
AIC BIC logLik deviance df.resid
311.7 328.6 -151.9 303.7 496
Scaled residuals:
Min 1Q Median 3Q Max
-2.00320 -0.18842 -0.01074 0.26742 2.08614
Random effects:
Groups Name Variance Std.Dev.
country (Intercept) 6.721 2.593
Number of obs: 500, groups: country, 10
Fixed effects:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.1587 0.8481 0.187 0.852
x1 3.7344 0.5011 7.452 9.22e-14 ***
x2 1.5496 0.2422 6.397 1.58e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) x1
x1 0.049
x2 0.022 0.174