Question

我用负二项模型对数据进行了拟合，结果如下：

> ngbinmodel <- glm.nb( seizure.rate ~ age + treatment, data = epilepsy_reduced)
>     summary(ngbinmodel)

Call:
glm.nb(formula = seizure.rate ~ age + treatment, data = epilepsy_reduced, 
    init.theta = 1.498983674, link = log)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3510  -0.8790  -0.4563   0.4328   1.8916  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  2.0985089  0.5845392   3.590 0.000331 ***
age         -0.0007965  0.0193064  -0.041 0.967092    
treatment   -0.5011593  0.2405658  -2.083 0.037228 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for Negative Binomial(1.499) family taken to be 1)

    Null deviance: 71.217  on 57  degrees of freedom
Residual deviance: 66.875  on 55  degrees of freedom
AIC: 341.12

Number of Fisher Scoring iterations: 1


              Theta:  1.499 
          Std. Err.:  0.362 

 2 x log-likelihood:  -333.119

现在我想检查是否应该包括年龄和治疗之间的交互作用。我找到了两种方法：

  > intearaction_nbm<-addterm(ngbinmodel, . ~ . * age,test="Chisq")
            >     summary(intearaction_nbm)
                   Df         AIC             LRT            Pr(Chi)      
             Min.   :1   Min.   :339.1   Min.   :0.9383   Min.   :0.3327  
             1st Qu.:1   1st Qu.:339.4   1st Qu.:0.9383   1st Qu.:0.3327  
             Median :1   Median :339.6   Median :0.9383   Median :0.3327  
             Mean   :1   Mean   :339.6   Mean   :0.9383   Mean   :0.3327  
             3rd Qu.:1   3rd Qu.:339.9   3rd Qu.:0.9383   3rd Qu.:0.3327  
             Max.   :1   Max.   :340.2   Max.   :0.9383   Max.   :0.3327  
             NA's   :1                   NA's   :1        NA's   :1

和

> ngbinmodel_int <- glm.nb( seizure.rate ~ age*treatment, data = epilepsy_reduced)
>     summary(ngbinmodel_int)
glm.nb(formula = seizure.rate ~ age * treatment, data = epilepsy_reduced, 
    init.theta = 1.531539174, link = log)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3503  -0.8742  -0.3848   0.3403   1.8508  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)  
(Intercept)    1.51361    0.83920   1.804   0.0713 .
age            0.01914    0.02826   0.677   0.4981  
treatment      0.60748    1.12199   0.541   0.5882  
age:treatment -0.03893    0.03850  -1.011   0.3119  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for Negative Binomial(1.5315) family taken to be 1)

    Null deviance: 72.238  on 57  degrees of freedom
Residual deviance: 66.874  on 54  degrees of freedom
AIC: 342.18

Number of Fisher Scoring iterations: 1


              Theta:  1.532 
          Std. Err.:  0.373 

 2 x log-likelihood:  -332.180

我期望从这两种方法中获得相同的结果。

如何访问intearaction_nbm的回归估算值？

为什么结果会有所不同？根据intearaction_nbm我应该包含交互项（AIC较低）但是根据ngbinmodel_int我不应该包含交互项（AIC增加）。

您会建议将我的连续变量age离散化吗？

Answer 1

备注：您应该把这个帖子移到 Cross Validated（Stack Exchange 的统计学问答站点）。

如何访问intearaction_nbm的回归估计值？

intearaction_nbm给出的是向模型中逐一添加单个项的结果：如果直接打印它，每个可能添加的项（age:treatment、age:another_variable等）都会对应一行，给出相应的AIC和P值等。（对它调用summary()得到的只是这张结果表各列的汇总统计量，而不是模型的回归估计值。）

为什么结果不同？

没有数据就无法回答，但我会先定义两个模型，再用AIC(model_1, model_2)比较它们的AIC，这样才能确信比较的是同一个量。如您所知，AIC的定义只精确到一个加性常数；除非检查具体的计算方式，否则无法确定两个不同包中的两个不同函数是否使用了相同的定义。

您会建议将我的连续变量age离散化吗？

没有数据就无法回答......

Answer 2

让我们考虑数据集quine和以下模型，该模型只包含因子Eth和Lrn的主效应：

# MASS supplies glm.nb(), addterm() and the quine data set used in this answer.
library(MASS)

# Negative binomial fit with the main effects of Eth and Lrn only.
negbin_no_int <- glm.nb(Days ~ Eth + Lrn, data = quine)
summary(negbin_no_int)

# Coefficients:
#             Estimate Std. Error z value Pr(>|z|)    
# (Intercept)   3.0367     0.1334  22.764  < 2e-16 ***
# EthN         -0.5520     0.1597  -3.457 0.000546 ***
# LrnSL         0.0388     0.1611   0.241 0.809661

extractAIC(negbin_no_int)

# [1]    3.000 1112.576

包含这两个因子之间交互项的模型是：

# Same model with the interaction added: Eth * Lrn expands to Eth + Lrn + Eth:Lrn.
negbin_with_int <- glm.nb(Days ~ Eth * Lrn, data = quine)
summary(negbin_with_int)

# Coefficients:
#             Estimate Std. Error z value Pr(>|z|)    
# (Intercept)   2.9218     0.1503  19.446   <2e-16 ***
# EthN         -0.3374     0.2100  -1.607    0.108    
# LrnSL         0.2929     0.2307   1.269    0.204    
# EthN:LrnSL   -0.4956     0.3201  -1.549    0.122

extractAIC(negbin_with_int)
# [1]    4.000 1112.196

交互项的统计显著性为p=0.122。现在我们使用addterm比较两个模型：

# Evaluate adding the single term Eth:Lrn to the main-effects model;
# test = "Chisq" adds the LRT and Pr(Chi) columns to the printed table.
interaction_nbm <- addterm(negbin_no_int, . ~ . + Eth:Lrn, test="Chisq")
print(interaction_nbm)

# Model:
# Days ~ Eth + Lrn
#         Df    AIC    LRT Pr(Chi)
# <none>     1112.6               
# Eth:Lrn  1 1112.2 2.3804  0.1229

addterm给出的AIC与使用extractAIC计算的相同。

如果您想查看addterm所拟合模型的回归估计值，可以复制addterm的源代码，并在其中加入print(summary(nfit))，如下所示：

# Single-term additions table that also prints the summary of every candidate
# model it manages to refit, so the regression estimates behind each row of
# the table are visible.
#
# Arguments:
#   object  fitted model to extend; it is refitted through update().
#   scope   either a character vector of term labels to try, or a formula
#           that is resolved against `object` with update.formula() and
#           add.scope().
#   scale   forwarded to extractAIC(); a value > 0 is also reported in the
#           heading of the result.
#   test    "none" (default) or "Chisq"; "Chisq" appends the columns
#           "LRT" and "Pr(Chi)".
#   k       penalty per parameter, forwarded to extractAIC().
#   sorted  if TRUE the rows of the result are ordered by AIC.
#   trace   if TRUE a message is emitted before each candidate is fitted.
#   ...     forwarded to extractAIC().
#
# Returns a data frame of class c("anova", "data.frame") with one row for the
# current model ("<none>") and one row per candidate term. Candidates whose
# refit fails get NA entries.
#
# Side effect: print(summary(fit)) is called for each candidate that was
# fitted successfully.
myaddterm <- function (object, scope, scale = 0, test = c("none", "Chisq"), 
    k = 2, sorted = FALSE, trace = FALSE, ...) 
{
    if (missing(scope) || is.null(scope)) 
        stop("no terms in scope")
    if (!is.character(scope)) 
        scope <- add.scope(object, update.formula(object, scope))
    if (!length(scope)) 
        stop("no terms in scope for adding to object")
    ns <- length(scope)
    # Row 1 holds the current model, rows 2..ns+1 the candidate models.
    ans <- matrix(nrow = ns + 1L, ncol = 2L, dimnames = list(c("<none>", 
        scope), c("df", "AIC")))
    ans[1L, ] <- extractAIC(object, scale, k = k, ...)
    n0 <- nobs(object, use.fallback = TRUE)
    # Refit in the environment of the model formula so the original data and
    # variables are found.
    env <- environment(formula(object))
    for (i in seq_len(ns)) {
        tt <- scope[i]
        if (trace) {
            message(gettextf("trying + %s", tt), domain = NA)
            utils::flush.console()
        }
        nfit <- update(object, as.formula(paste("~ . +", tt)), 
            evaluate = FALSE)
        nfit <- try(eval(nfit, envir = env), silent = TRUE)
        ans[i + 1L, ] <- if (!inherits(nfit, "try-error")) {
            # Only a successful refit has a model summary worth showing; a
            # failed one is a "try-error" string and is reported as NA below.
            print(summary(nfit))
            nnew <- nobs(nfit, use.fallback = TRUE)
            if (all(is.finite(c(n0, nnew))) && nnew != n0) 
                stop("number of rows in use has changed: remove missing values?")
            extractAIC(nfit, scale, k = k, ...)
        }
        else NA_real_
    }
    dfs <- ans[, 1L] - ans[1L, 1L]
    dfs[1L] <- NA
    aod <- data.frame(Df = dfs, AIC = ans[, 2L])
    o <- if (sorted) 
        order(aod$AIC)
    else seq_along(aod$AIC)
    test <- match.arg(test)
    if (test == "Chisq") {
        # Remove the k * df penalty from each AIC, then take the difference
        # from the current model to obtain the likelihood-ratio statistic.
        dev <- ans[, 2L] - k * ans[, 1L]
        dev <- dev[1L] - dev
        dev[1L] <- NA
        nas <- !is.na(dev)
        P <- dev
        P[nas] <- MASS:::safe_pchisq(dev[nas], dfs[nas], lower.tail = FALSE)
        aod[, c("LRT", "Pr(Chi)")] <- list(dev, P)
    }
    aod <- aod[o, ]
    head <- c("Single term additions", "\nModel:", deparse(formula(object)))
    if (scale > 0) 
        head <- c(head, paste("\nscale: ", format(scale), "\n"))
    class(aod) <- c("anova", "data.frame")
    attr(aod, "heading") <- head
    aod
}
interaction_nbm1 <- myaddterm(negbin_no_int, . ~ . + Eth:Lrn, test="Chisq")

输出结果为：

Call:
glm.nb(formula = Days ~ Eth + Lrn + Eth:Lrn, data = quine, init.theta = 1.177546225, 
    link = log)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.5770  -1.0470  -0.3645   0.3521   2.7227  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)   2.9218     0.1503  19.446   <2e-16 ***
EthN         -0.3374     0.2100  -1.607    0.108    
LrnSL         0.2929     0.2307   1.269    0.204    
EthN:LrnSL   -0.4956     0.3201  -1.549    0.122    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for Negative Binomial(1.1775) family taken to be 1)

    Null deviance: 182.93  on 145  degrees of freedom
Residual deviance: 168.18  on 142  degrees of freedom
AIC: 1114.2

Number of Fisher Scoring iterations: 1


              Theta:  1.178 
          Std. Err.:  0.146 

 2 x log-likelihood:  -1104.196

R

2 个答案: