在R

时间:2015-11-11 09:33:15

标签: r glm newtons-method

在拟合glm modelel(来自统计数据包)时,R中是否有一个包绘制newton-raphson / fisher评分迭代的包?

1 个答案:

答案 0 :(得分:0)

Programming to an interface。然而,在你的情况下,事情有点简单。

请注意,当您调用glm时,它最终调用glm.fit(或您指定给method的任何其他glm参数),该参数计算循环中的解决方案路径使用.Call到C函数C_Cdqrls在第97行计算当前迭代的系数值。作为一个黑客,您可以通过修改fit$coefficients函数,在此循环内将系数的当前值提取到全局环境(glm.fit):

glm.fit.new = function (x, y, weights = rep(1, nobs), start = NULL, etastart = NULL, 
                        mustart = NULL, offset = rep(0, nobs), family = gaussian(), 
                        control = list(), intercept = TRUE) {
                          control <- do.call("glm.control", control)
                          x <- as.matrix(x)
                          xnames <- dimnames(x)[[2L]]
                          ynames <- if (is.matrix(y)) 
                            rownames(y)
                          else names(y)
                          conv <- FALSE
                          nobs <- NROW(y)
                          nvars <- ncol(x)
                          EMPTY <- nvars == 0
                          if (is.null(weights)) 
                            weights <- rep.int(1, nobs)
                          if (is.null(offset)) 
                            offset <- rep.int(0, nobs)
                          variance <- family$variance
                          linkinv <- family$linkinv
                          if (!is.function(variance) || !is.function(linkinv)) 
                            stop("'family' argument seems not to be a valid family object", 
                                 call. = FALSE)
                          dev.resids <- family$dev.resids
                          aic <- family$aic
                          mu.eta <- family$mu.eta
                          unless.null <- function(x, if.null) if (is.null(x)) 
                            if.null
                          else x
                          valideta <- unless.null(family$valideta, function(eta) TRUE)
                          validmu <- unless.null(family$validmu, function(mu) TRUE)
                          if (is.null(mustart)) {
                            eval(family$initialize)
                          }
                          else {
                            mukeep <- mustart
                            eval(family$initialize)
                            mustart <- mukeep
                          }
                          if (EMPTY) {
                            eta <- rep.int(0, nobs) + offset
                            if (!valideta(eta)) 
                              stop("invalid linear predictor values in empty model", 
                                   call. = FALSE)
                            mu <- linkinv(eta)
                            if (!validmu(mu)) 
                              stop("invalid fitted means in empty model", call. = FALSE)
                            dev <- sum(dev.resids(y, mu, weights))
                            w <- ((weights * mu.eta(eta)^2)/variance(mu))^0.5
                            residuals <- (y - mu)/mu.eta(eta)
                            good <- rep_len(TRUE, length(residuals))
                            boundary <- conv <- TRUE
                            coef <- numeric()
                            iter <- 0L
                          }
                          else {
                            coefold <- NULL
                            eta <- if (!is.null(etastart)) 
                              etastart
                            else if (!is.null(start)) 
                              if (length(start) != nvars) 
                                stop(gettextf("length of 'start' should equal %d and correspond to initial coefs for %s", 
                                              nvars, paste(deparse(xnames), collapse = ", ")), 
                                     domain = NA)
                            else {
                              coefold <- start
                              offset + as.vector(if (NCOL(x) == 1L) 
                                x * start
                                else x %*% start)
                            }
                            else family$linkfun(mustart)
                            mu <- linkinv(eta)
                            if (!(validmu(mu) && valideta(eta))) 
                              stop("cannot find valid starting values: please specify some", 
                                   call. = FALSE)
                            devold <- sum(dev.resids(y, mu, weights))
                            boundary <- conv <- FALSE

                            # EDIT: counter to create track of iterations
                            i <<- 1
                            for (iter in 1L:control$maxit) {
                              good <- weights > 0
                              varmu <- variance(mu)[good]
                              if (anyNA(varmu)) 
                                stop("NAs in V(mu)")
                              if (any(varmu == 0)) 
                                stop("0s in V(mu)")
                              mu.eta.val <- mu.eta(eta)
                              if (any(is.na(mu.eta.val[good]))) 
                                stop("NAs in d(mu)/d(eta)")
                              good <- (weights > 0) & (mu.eta.val != 0)
                              if (all(!good)) {
                                conv <- FALSE
                                warning(gettextf("no observations informative at iteration %d", 
                                                 iter), domain = NA)
                                break
                              }
                              z <- (eta - offset)[good] + (y - mu)[good]/mu.eta.val[good]
                              w <- sqrt((weights[good] * mu.eta.val[good]^2)/variance(mu)[good])
                              fit <- .Call(stats:::C_Cdqrls, x[good, , drop = FALSE] * 
                                             w, z * w, min(1e-07, control$epsilon/1000), check = FALSE)

                              #======================================================
                              # EDIT: assign the coefficients to variables in the global namespace
                              #======================================================
                              assign(paste0("iteration_x_", i), fit$coefficients, 
                                     envir = .GlobalEnv)
                              i <<- i + 1   # increase the counter

                              if (any(!is.finite(fit$coefficients))) {
                                conv <- FALSE
                                warning(gettextf("non-finite coefficients at iteration %d", 
                                                 iter), domain = NA)
                                break
                              }
                              if (nobs < fit$rank) 
                                stop(sprintf(ngettext(nobs, "X matrix has rank %d, but only %d observation", 
                                                      "X matrix has rank %d, but only %d observations"), 
                                             fit$rank, nobs), domain = NA)
                              start[fit$pivot] <- fit$coefficients
                              eta <- drop(x %*% start)
                              mu <- linkinv(eta <- eta + offset)
                              dev <- sum(dev.resids(y, mu, weights))
                              if (control$trace) 
                                cat("Deviance = ", dev, " Iterations - ", iter, 
                                    "\n", sep = "")
                              boundary <- FALSE
                              if (!is.finite(dev)) {
                                if (is.null(coefold)) 
                                  stop("no valid set of coefficients has been found: please supply starting values", 
                                       call. = FALSE)
                                warning("step size truncated due to divergence", 
                                        call. = FALSE)
                                ii <- 1
                                while (!is.finite(dev)) {
                                  if (ii > control$maxit) 
                                    stop("inner loop 1; cannot correct step size", 
                                         call. = FALSE)
                                  ii <- ii + 1
                                  start <- (start + coefold)/2
                                  eta <- drop(x %*% start)
                                  mu <- linkinv(eta <- eta + offset)
                                  dev <- sum(dev.resids(y, mu, weights))
                                }
                                boundary <- TRUE
                                if (control$trace) 
                                  cat("Step halved: new deviance = ", dev, "\n", 
                                      sep = "")
                              }
                              if (!(valideta(eta) && validmu(mu))) {
                                if (is.null(coefold)) 
                                  stop("no valid set of coefficients has been found: please supply starting values", 
                                       call. = FALSE)
                                warning("step size truncated: out of bounds", 
                                        call. = FALSE)
                                ii <- 1
                                while (!(valideta(eta) && validmu(mu))) {
                                  if (ii > control$maxit) 
                                    stop("inner loop 2; cannot correct step size", 
                                         call. = FALSE)
                                  ii <- ii + 1
                                  start <- (start + coefold)/2
                                  eta <- drop(x %*% start)
                                  mu <- linkinv(eta <- eta + offset)
                                }
                                boundary <- TRUE
                                dev <- sum(dev.resids(y, mu, weights))
                                if (control$trace) 
                                  cat("Step halved: new deviance = ", dev, "\n", 
                                      sep = "")
                              }
                              if (abs(dev - devold)/(0.1 + abs(dev)) < control$epsilon) {
                                conv <- TRUE
                                coef <- start
                                break
                              }
                              else {
                                devold <- dev
                                coef <- coefold <- start
                              }
                            }
                            if (!conv) 
                              warning("glm.fit: algorithm did not converge", call. = FALSE)
                            if (boundary) 
                              warning("glm.fit: algorithm stopped at boundary value", 
                                      call. = FALSE)
                            eps <- 10 * .Machine$double.eps
                            if (family$family == "binomial") {
                              if (any(mu > 1 - eps) || any(mu < eps)) 
                                warning("glm.fit: fitted probabilities numerically 0 or 1 occurred", 
                                        call. = FALSE)
                            }
                            if (family$family == "poisson") {
                              if (any(mu < eps)) 
                                warning("glm.fit: fitted rates numerically 0 occurred", 
                                        call. = FALSE)
                            }
                            if (fit$rank < nvars) 
                              coef[fit$pivot][seq.int(fit$rank + 1, nvars)] <- NA
                            xxnames <- xnames[fit$pivot]
                            residuals <- (y - mu)/mu.eta(eta)
                            fit$qr <- as.matrix(fit$qr)
                            nr <- min(sum(good), nvars)
                            if (nr < nvars) {
                              Rmat <- diag(nvars)
                              Rmat[1L:nr, 1L:nvars] <- fit$qr[1L:nr, 1L:nvars]
                            }
                            else Rmat <- fit$qr[1L:nvars, 1L:nvars]
                            Rmat <- as.matrix(Rmat)
                            Rmat[row(Rmat) > col(Rmat)] <- 0
                            names(coef) <- xnames
                            colnames(fit$qr) <- xxnames
                            dimnames(Rmat) <- list(xxnames, xxnames)
                          }
                          names(residuals) <- ynames
                          names(mu) <- ynames
                          names(eta) <- ynames
                          wt <- rep.int(0, nobs)
                          wt[good] <- w^2
                          names(wt) <- ynames
                          names(weights) <- ynames
                          names(y) <- ynames
                          if (!EMPTY) 
                            names(fit$effects) <- c(xxnames[seq_len(fit$rank)], rep.int("", 
                                                                                        sum(good) - fit$rank))
                          wtdmu <- if (intercept) 
                            sum(weights * y)/sum(weights)
                          else linkinv(offset)
                          nulldev <- sum(dev.resids(y, wtdmu, weights))
                          n.ok <- nobs - sum(weights == 0)
                          nulldf <- n.ok - as.integer(intercept)
                          rank <- if (EMPTY) 
                            0
                          else fit$rank
                          resdf <- n.ok - rank
                          aic.model <- aic(y, n, mu, weights, dev) + 2 * rank
                          list(coefficients = coef, residuals = residuals, fitted.values = mu, 
                               effects = if (!EMPTY) fit$effects, R = if (!EMPTY) Rmat, 
                               rank = rank, qr = if (!EMPTY) structure(fit[c("qr", "rank", 
                                                                             "qraux", "pivot", "tol")], class = "qr"), family = family, 
                               linear.predictors = eta, deviance = dev, aic = aic.model, 
                               null.deviance = nulldev, iter = iter, weights = wt, prior.weights = weights, 
                               df.residual = resdf, df.null = nulldf, y = y, converged = conv, 
                               boundary = boundary)
                        }

请注意,由于以下几个原因,这是一个黑客攻击:
1.程序C_Cdrqls不会导出stats函数,因此我们必须在namespace:package:stats内查找。{1}}。 2.通过调用glm.fit.new的副作用,使用迭代值污染您的全局环境,每次迭代创建一个向量。副作用通常在像R这样的函数式语言中不受欢迎。您可以通过创建矩阵或data.frame并在其中进行分配来清理多个对象。

但是,一旦提取了迭代值,您就可以随意使用它们,包括绘制它们。

以下是使用新定义的glm方法调用glm.fit.new的方式:

counts = c(18,17,15,20,10,20,25,13,12)
outcome = gl(3,1,9)
treatment = gl(3,3)
print(d.AD = data.frame(treatment, outcome, counts))
glm.D93 = glm(counts ~ outcome + treatment, family = poisson(), 
               control = list(trace = TRUE, epsilon = 1e-16), method = "glm.fit.new")

您可以检查迭代参数值是否确实填充在全局环境中:

> ls(pattern = "iteration_x_")
 [1] "iteration_x_1"  "iteration_x_10" "iteration_x_11" "iteration_x_2" 
 [5] "iteration_x_3"  "iteration_x_4"  "iteration_x_5"  "iteration_x_6" 
 [9] "iteration_x_7"  "iteration_x_8"  "iteration_x_9"