为Cramer Von Mises测试编写函数

时间:2014-04-01 15:44:09

标签: r

来自cvm.test()包的dgof提供了一种在离散分布上进行单样本Cramer-von Mises测试的方法,我的目标是开发一个对连续分布进行测试的函数(如ks.test()包中的Kolmogorov-Smirnov stats。)

注意:这篇文章只涉及完全指定的df null假设,所以请不要在这里进行引导或蒙特卡罗模拟

> cvm.test
function (x, y, type = c("W2", "U2", "A2"), simulate.p.value = FALSE, 
    B = 2000, tol = 1e-08) 
{
    cvm.pval.disc <- function(STAT, lambda) {
        x <- STAT
        theta <- function(u) {
            VAL <- 0
            for (i in 1:length(lambda)) {
                VAL <- VAL + 0.5 * atan(lambda[i] * u)
            }
            return(VAL - 0.5 * x * u)
        }
        rho <- function(u) {
            VAL <- 0
            for (i in 1:length(lambda)) {
                VAL <- VAL + log(1 + lambda[i]^2 * u^2)
            }
            VAL <- exp(VAL * 0.25)
            return(VAL)
        }
        fun <- function(u) return(sin(theta(u))/(u * rho(u)))
        pval <- 0
        try(pval <- 0.5 + integrate(fun, 0, Inf, subdivisions = 1e+06)$value/pi, 
            silent = TRUE)
        if (pval > 0.001) 
            return(pval)
        if (pval <= 0.001) {
            df <- sum(lambda != 0)
            est1 <- dchisq(STAT/max(lambda), df)
            logf <- function(t) {
                ans <- -t * STAT
                ans <- ans - 0.5 * sum(log(1 - 2 * t * lambda))
                return(ans)
            }
            est2 <- 1
            try(est2 <- exp(nlm(logf, 1/(4 * max(lambda)))$minimum), 
                silent = TRUE)
            return(min(est1, est2))
        }
    }
    cvm.stat.disc <- function(x, y, type = c("W2", "U2", "A2")) {
        type <- match.arg(type)
        I <- knots(y)
        N <- length(x)
        e <- diff(c(0, N * y(I)))
        obs <- rep(0, length(I))
        for (j in 1:length(I)) {
            obs[j] <- length(which(x == I[j]))
        }
        S <- cumsum(obs)
        T <- cumsum(e)
        H <- T/N
        p <- e/N
        t <- (p + p[c(2:length(p), 1)])/2
        Z <- S - T
        Zbar <- sum(Z * t)
        S0 <- diag(p) - p %*% t(p)
        A <- matrix(1, length(p), length(p))
        A <- apply(row(A) >= col(A), 2, as.numeric)
        E <- diag(t)
        One <- rep(1, nrow(E))
        K <- diag(0, length(H))
        diag(K)[-length(H)] <- 1/(H[-length(H)] * (1 - H[-length(H)]))
        Sy <- A %*% S0 %*% t(A)
        M <- switch(type, W2 = E, U2 = (diag(1, nrow(E)) - E %*% 
            One %*% t(One)) %*% E %*% (diag(1, nrow(E)) - One %*% 
            t(One) %*% E), A2 = E %*% K)
        lambda <- eigen(M %*% Sy)$values
        STAT <- switch(type, W2 = sum(Z^2 * t)/N, U2 = sum((Z - 
            Zbar)^2 * t)/N, A2 = sum((Z^2 * t/(H * (1 - H)))[-length(I)])/N)
        return(c(STAT, lambda))
    }
    cvm.pval.disc.sim <- function(STATISTIC, lambda, y, type, 
        tol, B) {
        knots.y <- knots(y)
        fknots.y <- y(knots.y)
        u <- runif(B * length(x))
        u <- sapply(u, function(a) return(knots.y[sum(a > fknots.y) + 
            1]))
        dim(u) <- c(B, length(x))
        s <- apply(u, 1, cvm.stat.disc, y, type)
        s <- s[1, ]
        return(sum(s >= STATISTIC - tol)/B)
    }
    type <- match.arg(type)
    DNAME <- deparse(substitute(x))
    if (is.stepfun(y)) {
        if (length(setdiff(x, knots(y))) != 0) {
            stop("Data are incompatable with null distribution; ", 
                "Note: This function is meant only for discrete distributions ", 
                "you may be receiving this error because y is continuous.")
        }
        tempout <- cvm.stat.disc(x, y, type = type)
        STAT <- tempout[1]
        lambda <- tempout[2:length(tempout)]
        if (!simulate.p.value) {
            PVAL <- cvm.pval.disc(STAT, lambda)
        }
        else {
            PVAL <- cvm.pval.disc.sim(STAT, lambda, y, type, 
                tol, B)
        }
        METHOD <- paste("Cramer-von Mises -", type)
        names(STAT) <- as.character(type)
        RVAL <- list(statistic = STAT, p.value = PVAL, alternative = "Two.sided", 
            method = METHOD, data.name = DNAME)
    }
    else {
        stop("Null distribution must be a discrete.")
    }
    class(RVAL) <- "htest"
    return(RVAL)
}
<environment: namespace:dgof>

来自ks.test()包的Kolmogorov-Smirnov stats进行比较(请注意,此函数同时执行单样本和双样本测试):

> ks.test
function (x, y, ..., alternative = c("two.sided", "less", "greater"), 
    exact = NULL, tol = 1e-08, simulate.p.value = FALSE, B = 2000) 
{
    pkolmogorov1x <- function(x, n) {
        if (x <= 0) 
            return(0)
        if (x >= 1) 
            return(1)
        j <- seq.int(from = 0, to = floor(n * (1 - x)))
        1 - x * sum(exp(lchoose(n, j) + (n - j) * log(1 - x - 
            j/n) + (j - 1) * log(x + j/n)))
    }
    exact.pval <- function(alternative, STATISTIC, x, n, y, knots.y, 
        tol) {
        ts.pval <- function(S, x, n, y, knots.y, tol) {
            f_n <- ecdf(x)
            eps <- min(tol, min(diff(knots.y)) * tol)
            eps2 <- min(tol, min(diff(y(knots.y))) * tol)
            a <- rep(0, n)
            b <- a
            f_a <- a
            for (i in 1:n) {
                a[i] <- min(c(knots.y[which(y(knots.y) + S >= 
                  i/n + eps2)[1]], Inf), na.rm = TRUE)
                b[i] <- min(c(knots.y[which(y(knots.y) - S > 
                  (i - 1)/n - eps2)[1]], Inf), na.rm = TRUE)
                f_a[i] <- ifelse(!(a[i] %in% knots.y), y(a[i]), 
                  y(a[i] - eps))
            }
            f_b <- y(b)
            p <- rep(1, n + 1)
            for (i in 1:n) {
                tmp <- 0
                for (k in 0:(i - 1)) {
                  tmp <- tmp + choose(i, k) * (-1)^(i - k - 1) * 
                    max(f_b[k + 1] - f_a[i], 0)^(i - k) * p[k + 
                    1]
                }
                p[i + 1] <- tmp
            }
            p <- max(0, 1 - p[n + 1])
            if (p > 1) {
                warning("numerical instability in p-value calculation.")
                p <- 1
            }
            return(p)
        }
        less.pval <- function(S, n, H, z, tol) {
            m <- ceiling(n * (1 - S))
            c <- S + (1:m - 1)/n
            CDFVAL <- H(sort(z))
            for (j in 1:length(c)) {
                ifelse((min(abs(c[j] - CDFVAL)) < tol), c[j] <- 1 - 
                  c[j], c[j] <- 1 - CDFVAL[which(order(c(c[j], 
                  CDFVAL)) == 1)])
            }
            b <- rep(0, m)
            b[1] <- 1
            for (k in 1:(m - 1)) b[k + 1] <- 1 - sum(choose(k, 
                1:k - 1) * c[1:k]^(k - 1:k + 1) * b[1:k])
            p <- sum(choose(n, 0:(m - 1)) * c^(n - 0:(m - 1)) * 
                b)
            return(p)
        }
        greater.pval <- function(S, n, H, z, tol) {
            m <- ceiling(n * (1 - S))
            c <- 1 - (S + (1:m - 1)/n)
            CDFVAL <- c(0, H(sort(z)))
            for (j in 1:length(c)) {
                if (!(min(abs(c[j] - CDFVAL)) < tol)) 
                  c[j] <- CDFVAL[which(order(c(c[j], CDFVAL)) == 
                    1) - 1]
            }
            b <- rep(0, m)
            b[1] <- 1
            for (k in 1:(m - 1)) b[k + 1] <- 1 - sum(choose(k, 
                1:k - 1) * c[1:k]^(k - 1:k + 1) * b[1:k])
            p <- sum(choose(n, 0:(m - 1)) * c^(n - 0:(m - 1)) * 
                b)
            return(p)
        }
        p <- switch(alternative, two.sided = ts.pval(STATISTIC, 
            x, n, y, knots.y, tol), less = less.pval(STATISTIC, 
            n, y, knots.y, tol), greater = greater.pval(STATISTIC, 
            n, y, knots.y, tol))
        return(p)
    }
    sim.pval <- function(alternative, STATISTIC, x, n, y, knots.y, 
        tol, B) {
        fknots.y <- y(knots.y)
        u <- runif(B * length(x))
        u <- sapply(u, function(a) return(knots.y[sum(a > fknots.y) + 
            1]))
        dim(u) <- c(B, length(x))
        getks <- function(a, knots.y, fknots.y) {
            dev <- c(0, ecdf(a)(knots.y) - fknots.y)
            STATISTIC <- switch(alternative, two.sided = max(abs(dev)), 
                greater = max(dev), less = max(-dev))
            return(STATISTIC)
        }
        s <- apply(u, 1, getks, knots.y, fknots.y)
        return(sum(s >= STATISTIC - tol)/B)
    }
    alternative <- match.arg(alternative)
    DNAME <- deparse(substitute(x))
    x <- x[!is.na(x)]
    n <- length(x)
    if (n < 1L) 
        stop("not enough 'x' data")
    PVAL <- NULL
    if (is.numeric(y)) {
        DNAME <- paste(DNAME, "and", deparse(substitute(y)))
        y <- y[!is.na(y)]
        n.x <- as.double(n)
        n.y <- length(y)
        if (n.y < 1L) 
            stop("not enough 'y' data")
        if (is.null(exact)) 
            exact <- (n.x * n.y < 10000)
        METHOD <- "Two-sample Kolmogorov-Smirnov test"
        TIES <- FALSE
        n <- n.x * n.y/(n.x + n.y)
        w <- c(x, y)
        z <- cumsum(ifelse(order(w) <= n.x, 1/n.x, -1/n.y))
        if (length(unique(w)) < (n.x + n.y)) {
            warning("cannot compute correct p-values with ties")
            z <- z[c(which(diff(sort(w)) != 0), n.x + n.y)]
            TIES <- TRUE
        }
        STATISTIC <- switch(alternative, two.sided = max(abs(z)), 
            greater = max(z), less = -min(z))
        nm_alternative <- switch(alternative, two.sided = "two-sided", 
            less = "the CDF of x lies below that of y", greater = "the CDF of x lies above that of y")
        if (exact && (alternative == "two.sided") && !TIES) 
            PVAL <- 1 - .C("psmirnov2x", p = as.double(STATISTIC), 
                as.integer(n.x), as.integer(n.y), PACKAGE = "dgof")$p
    }
    else if (is.stepfun(y)) {
        z <- knots(y)
        if (is.null(exact)) 
            exact <- (n <= 30)
        if (exact && n > 30) {
            warning("numerical instability may affect p-value")
        }
        METHOD <- "One-sample Kolmogorov-Smirnov test"
        dev <- c(0, ecdf(x)(z) - y(z))
        STATISTIC <- switch(alternative, two.sided = max(abs(dev)), 
            greater = max(dev), less = max(-dev))
        if (simulate.p.value) {
            PVAL <- sim.pval(alternative, STATISTIC, x, n, y, 
                z, tol, B)
        }
        else {
            PVAL <- switch(exact, `TRUE` = exact.pval(alternative, 
                STATISTIC, x, n, y, z, tol), `FALSE` = NULL)
        }
        nm_alternative <- switch(alternative, two.sided = "two-sided", 
            less = "the CDF of x lies below the null hypothesis", 
            greater = "the CDF of x lies above the null hypothesis")
    }
    else {
        if (is.character(y)) 
            y <- get(y, mode = "function")
        if (mode(y) != "function") 
            stop("'y' must be numeric or a string naming a valid function")
        if (is.null(exact)) 
            exact <- (n < 100)
        METHOD <- "One-sample Kolmogorov-Smirnov test"
        TIES <- FALSE
        if (length(unique(x)) < n) {
            warning(paste("default ks.test() cannot compute correct p-values with ties;\n", 
                "see help page for one-sample Kolmogorov test for discrete distributions."))
            TIES <- TRUE
        }
        x <- y(sort(x), ...) - (0:(n - 1))/n
        STATISTIC <- switch(alternative, two.sided = max(c(x, 
            1/n - x)), greater = max(1/n - x), less = max(x))
        if (exact && !TIES) {
            PVAL <- if (alternative == "two.sided") 
                1 - .C("pkolmogorov2x", p = as.double(STATISTIC), 
                  as.integer(n), PACKAGE = "dgof")$p
            else 1 - pkolmogorov1x(STATISTIC, n)
        }
        nm_alternative <- switch(alternative, two.sided = "two-sided", 
            less = "the CDF of x lies below the null hypothesis", 
            greater = "the CDF of x lies above the null hypothesis")
    }
    names(STATISTIC) <- switch(alternative, two.sided = "D", 
        greater = "D^+", less = "D^-")
    pkstwo <- function(x, tol = 1e-06) {
        if (is.numeric(x)) 
            x <- as.vector(x)
        else stop("argument 'x' must be numeric")
        p <- rep(0, length(x))
        p[is.na(x)] <- NA
        IND <- which(!is.na(x) & (x > 0))
        if (length(IND)) {
            p[IND] <- .C("pkstwo", as.integer(length(x[IND])), 
                p = as.double(x[IND]), as.double(tol), PACKAGE = "dgof")$p
        }
        return(p)
    }
    if (is.null(PVAL)) {
        PVAL <- ifelse(alternative == "two.sided", 1 - pkstwo(sqrt(n) * 
            STATISTIC), exp(-2 * n * STATISTIC^2))
    }
    RVAL <- list(statistic = STATISTIC, p.value = PVAL, alternative = nm_alternative, 
        method = METHOD, data.name = DNAME)
    class(RVAL) <- "htest"
    return(RVAL)
}
<environment: namespace:dgof>

0 个答案:

没有答案