在R中为引导功能添加进度条

时间:2016-06-07 08:13:38

标签: r function loops progress bootstrapping

我正在尝试在R中的引导函数中添加进度条。 我试图使示例函数尽可能简单(因此我在本例中使用mean)。

library(boot)
v1 <- rnorm(1000)
rep_count = 1

m.boot <- function(data, indices) {
  d <- data[indices]
  setWinProgressBar(pb, rep_count)
  rep_count <- rep_count + 1
  Sys.sleep(0.01)
  mean(d, na.rm = T) 
  }

tot_rep <- 200
pb <- winProgressBar(title = "Bootstrap in progress", label = "",
                     min = 0, max = tot_rep, initial = 0, width = 300)
b <- boot(v1, m.boot, R = tot_rep)
close(pb)

引导程序正常运行,但问题是rep_count的值在循环中没有增加,进度条在此过程中保持冻结状态。

如果我在bootstrap完成后检查rep_count的值,它仍然是1.

我做错了什么?也许启动函数不是简单地在循环中插入m.boot函数,因此其中的变量不会增加?

谢谢。

6 个答案:

答案 0 :(得分:2)

增加的rep_count是一个局部变量,在每次函数调用后丢失。在下一次迭代中,函数再次从全局环境中获取rep_count,即其值为1.

您可以使用<<-

rep_count <<- rep_count + 1

这将分配给在函数外部的搜索路径上首次找到的rep_count。当然,通常不推荐使用<<-,因为应避免使用函数的副作用,但这里有一个合法的用例。但是,您应该将整个事物包装在一个函数中,以避免对全局环境产生副作用。

可能有更好的解决方案......

答案 1 :(得分:2)

pbapply 包旨在用于矢量化函数。在这个问题的上下文中有两种方法可以实现:(1)按照建议写一个包装器,它不会产生类'boot'的相同对象; (2)或者,行lapply(seq_len(RR), fn)可以写为pblapply(seq_len(RR), fn)。选项2可以通过本地复制/更新boot函数来实现,如下例所示,或者询问软件包维护者Brian Ripley是否考虑直接或通过 pbapply添加进度条作为依赖。

我的解决方案(评论指出的变化):

library(boot)
library(pbapply)
boot2 <- function (data, statistic, R, sim = "ordinary", stype = c("i", 
    "f", "w"), strata = rep(1, n), L = NULL, m = 0, weights = NULL, 
    ran.gen = function(d, p) d, mle = NULL, simple = FALSE, ..., 
    parallel = c("no", "multicore", "snow"), ncpus = getOption("boot.ncpus", 
        1L), cl = NULL) 
{
call <- match.call()
stype <- match.arg(stype)
if (missing(parallel)) 
    parallel <- getOption("boot.parallel", "no")
parallel <- match.arg(parallel)
have_mc <- have_snow <- FALSE
if (parallel != "no" && ncpus > 1L) {
    if (parallel == "multicore") 
        have_mc <- .Platform$OS.type != "windows"
    else if (parallel == "snow") 
        have_snow <- TRUE
    if (!have_mc && !have_snow) 
        ncpus <- 1L
    loadNamespace("parallel")
}
if (simple && (sim != "ordinary" || stype != "i" || sum(m))) {
    warning("'simple=TRUE' is only valid for 'sim=\"ordinary\", stype=\"i\", n=0', so ignored")
    simple <- FALSE
}
if (!exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) 
    runif(1)
seed <- get(".Random.seed", envir = .GlobalEnv, inherits = FALSE)
n <- NROW(data)
if ((n == 0) || is.null(n)) 
    stop("no data in call to 'boot'")
temp.str <- strata
strata <- tapply(seq_len(n), as.numeric(strata))
t0 <- if (sim != "parametric") {
    if ((sim == "antithetic") && is.null(L)) 
        L <- empinf(data = data, statistic = statistic, stype = stype, 
            strata = strata, ...)
    if (sim != "ordinary") 
        m <- 0
    else if (any(m < 0)) 
        stop("negative value of 'm' supplied")
    if ((length(m) != 1L) && (length(m) != length(table(strata)))) 
        stop("length of 'm' incompatible with 'strata'")
    if ((sim == "ordinary") || (sim == "balanced")) {
        if (isMatrix(weights) && (nrow(weights) != length(R))) 
            stop("dimensions of 'R' and 'weights' do not match")
    }
    else weights <- NULL
    if (!is.null(weights)) 
        weights <- t(apply(matrix(weights, n, length(R), 
            byrow = TRUE), 2L, normalize, strata))
    if (!simple) 
        i <- index.array(n, R, sim, strata, m, L, weights)
    original <- if (stype == "f") 
        rep(1, n)
    else if (stype == "w") {
        ns <- tabulate(strata)[strata]
        1/ns
    }
    else seq_len(n)
    t0 <- if (sum(m) > 0L) 
        statistic(data, original, rep(1, sum(m)), ...)
    else statistic(data, original, ...)
    rm(original)
    t0
}
else statistic(data, ...)
pred.i <- NULL
fn <- if (sim == "parametric") {
    ran.gen
    data
    mle
    function(r) {
        dd <- ran.gen(data, mle)
        statistic(dd, ...)
    }
}
else {
    if (!simple && ncol(i) > n) {
        pred.i <- as.matrix(i[, (n + 1L):ncol(i)])
        i <- i[, seq_len(n)]
    }
    if (stype %in% c("f", "w")) {
        f <- freq.array(i)
        rm(i)
        if (stype == "w") 
            f <- f/ns
        if (sum(m) == 0L) 
            function(r) statistic(data, f[r, ], ...)
        else function(r) statistic(data, f[r, ], pred.i[r, 
            ], ...)
    }
    else if (sum(m) > 0L) 
        function(r) statistic(data, i[r, ], pred.i[r, ], 
            ...)
    else if (simple) 
        function(r) statistic(data, index.array(n, 1, sim, 
            strata, m, L, weights), ...)
    else function(r) statistic(data, i[r, ], ...)
}
RR <- sum(R)
res <- if (ncpus > 1L && (have_mc || have_snow)) {
    if (have_mc) {
        parallel::mclapply(seq_len(RR), fn, mc.cores = ncpus)
    }
    else if (have_snow) {
        list(...)
        if (is.null(cl)) {
            cl <- parallel::makePSOCKcluster(rep("localhost", 
              ncpus))
            if (RNGkind()[1L] == "L'Ecuyer-CMRG") 
              parallel::clusterSetRNGStream(cl)
            res <- parallel::parLapply(cl, seq_len(RR), fn)
            parallel::stopCluster(cl)
            res
        }
        else parallel::parLapply(cl, seq_len(RR), fn)
    }
}
else pblapply(seq_len(RR), fn) #### changed !!!
t.star <- matrix(, RR, length(t0))
for (r in seq_len(RR)) t.star[r, ] <- res[[r]]
if (is.null(weights)) 
    weights <- 1/tabulate(strata)[strata]
boot.return(sim, t0, t.star, temp.str, R, data, statistic, 
    stype, call, seed, L, m, pred.i, weights, ran.gen, mle)
}
## Functions not exported by boot
isMatrix <- boot:::isMatrix
index.array <- boot:::index.array
boot.return <- boot:::boot.return
## Now the example
m.boot <- function(data, indices) {
  d <- data[indices]
  mean(d, na.rm = T) 
}
tot_rep <- 200
v1 <- rnorm(1000)
b <- boot2(v1, m.boot, R = tot_rep)

答案 2 :(得分:1)

我想我找到了一个可能的解决方案。这将@Roland的答案与pbapply包的便利性合并,使用其函数startpb()closepb()等。

library(boot)
library(pbapply)

v1 <- rnorm(1000)
rep_count = 1
tot_rep = 200

m.boot <- function(data, indices) {
  d <- data[indices]
  setpb(pb, rep_count)
  rep_count <<- rep_count + 1
  Sys.sleep(0.01)                #Just to slow down the process
  mean(d, na.rm = T) 
}

pb <- startpb(min = 0, max = tot_rep)
b <- boot(v1, m.boot, R = tot_rep)
closepb(pb)
rep_count = 1

如前所述,将函数中的所有内容包装起来可以避免弄乱rep_count变量。

答案 3 :(得分:1)

dplyr的进度条效果很好:

library(dplyr)
library(boot)

v1 <- rnorm(1000)

m.boot <- function(data, indices) {
  d <- data[indices]
  p$tick()$print()  # update progress bar
  Sys.sleep(0.01)
  mean(d, na.rm = T) 
}

tot_rep <- 200
p <- progress_estimated(tot_rep+1)  # init progress bar
b <- boot(v1, m.boot, R = tot_rep)

答案 4 :(得分:0)

您可以使用包pbapply

library(boot)
library(pbapply)
v1 <- rnorm(1000)
rep_count = 1

# your m.boot function ....
m.boot <- function(data, indices) {
                                   d <- data[indices]
                                   mean(d, na.rm = T) 
                                   }

# ... wraped in `bootfunc`
bootfunc <- function(x) { boot(x, m.boot, R = 200) }

# apply function to v1 , returning progress bar
pblapply(v1, bootfunc)

# > b <- pblapply(v1, bootfunc)
# >   |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% Elapsed time: 02s

答案 5 :(得分:0)

您可以按以下方式使用软件包progress

prior_task = prior_task >> PythonOperator

我还没有弄清楚为什么需要将library(boot) library(progress) v1 <- rnorm(1000) #add progress bar as parameter to function m.boot <- function(data, indices, prog) { #display progress with each run of the function prog$tick() d <- data[indices] Sys.sleep(0.01) mean(d, na.rm = T) } tot_rep <- 200 #initialize progress bar object pb <- progress_bar$new(total = tot_rep + 1) #perform bootstrap boot(data = v1, statistic = m.boot, R = tot_rep, prog = pb) 的迭代次数设置为bootstrap复制总数(参数progress_bar)的+1,但这是在我自己的代码,否则会引发错误。引导程序函数运行的时间似乎比参数R中指定的运行时间多,因此,如果将进度条设置为仅运行R次,它会认为作业实际上是在完成之前完成的。