Question

我正在尝试在R中的引导函数中添加进度条。我试图使示例函数尽可能简单（因此我在本例中使用mean）。

library(boot)
v1 <- rnorm(1000)
rep_count = 1

m.boot <- function(data, indices) {
  d <- data[indices]
  setWinProgressBar(pb, rep_count)
  rep_count <- rep_count + 1
  Sys.sleep(0.01)
  mean(d, na.rm = T) 
  }

tot_rep <- 200
pb <- winProgressBar(title = "Bootstrap in progress", label = "",
                     min = 0, max = tot_rep, initial = 0, width = 300)
b <- boot(v1, m.boot, R = tot_rep)
close(pb)

引导程序正常运行，但问题是rep_count的值在循环中没有增加，进度条在此过程中保持冻结状态。

如果我在bootstrap完成后检查rep_count的值，它仍然是1.

我做错了什么？也许启动函数不是简单地在循环中插入m.boot函数，因此其中的变量不会增加？

谢谢。

Answer 1

增加的rep_count是一个局部变量，在每次函数调用后丢失。在下一次迭代中，函数再次从全局环境中获取rep_count，即其值为1.

您可以使用<<-：

rep_count <<- rep_count + 1

这将分配给在函数外部的搜索路径上首次找到的rep_count。当然，通常不推荐使用<<-，因为应避免使用函数的副作用，但这里有一个合法的用例。但是，您应该将整个事物包装在一个函数中，以避免对全局环境产生副作用。

可能有更好的解决方案......

Answer 2

pbapply 包旨在用于矢量化函数。在这个问题的上下文中有两种方法可以实现：（1）按照建议写一个包装器，它不会产生类'boot'的相同对象; （2）或者，行lapply(seq_len(RR), fn)可以写为pblapply(seq_len(RR), fn)。选项2可以通过本地复制/更新boot函数来实现，如下例所示，或者询问软件包维护者Brian Ripley是否考虑直接或通过 pbapply添加进度条作为依赖。

我的解决方案（评论指出的变化）：

library(boot) library(pbapply) boot2 <- function (data, statistic, R, sim = "ordinary", stype = c("i", "f", "w"), strata = rep(1, n), L = NULL, m = 0, weights = NULL, ran.gen = function(d, p) d, mle = NULL, simple = FALSE, ..., parallel = c("no", "multicore", "snow"), ncpus = getOption("boot.ncpus", 1L), cl = NULL) { call <- match.call() stype <- match.arg(stype) if (missing(parallel)) parallel <- getOption("boot.parallel", "no") parallel <- match.arg(parallel) have_mc <- have_snow <- FALSE if (parallel != "no" && ncpus > 1L) { if (parallel == "multicore") have_mc <- .Platform$OS.type != "windows" else if (parallel == "snow") have_snow <- TRUE if (!have_mc && !have_snow) ncpus <- 1L loadNamespace("parallel") } if (simple && (sim != "ordinary" || stype != "i" || sum(m))) { warning("'simple=TRUE' is only valid for 'sim=\"ordinary\", stype=\"i\", n=0', so ignored") simple <- FALSE } if (!exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) runif(1) seed <- get(".Random.seed", envir = .GlobalEnv, inherits = FALSE) n <- NROW(data) if ((n == 0) || is.null(n)) stop("no data in call to 'boot'") temp.str <- strata strata <- tapply(seq_len(n), as.numeric(strata)) t0 <- if (sim != "parametric") { if ((sim == "antithetic") && is.null(L)) L <- empinf(data = data, statistic = statistic, stype = stype, strata = strata, ...) if (sim != "ordinary") m <- 0 else if (any(m < 0)) stop("negative value of 'm' supplied") if ((length(m) != 1L) && (length(m) != length(table(strata)))) stop("length of 'm' incompatible with 'strata'") if ((sim == "ordinary") || (sim == "balanced")) { if (isMatrix(weights) && (nrow(weights) != length(R))) stop("dimensions of 'R' and 'weights' do not match") } else weights <- NULL if (!is.null(weights)) weights <- t(apply(matrix(weights, n, length(R), byrow = TRUE), 2L, normalize, strata)) if (!simple) i <- index.array(n, R, sim, strata, m, L, weights) original <- if (stype == "f") rep(1, n) else if (stype == "w") { ns <- tabulate(strata)[strata] 1/ns } else seq_len(n) t0 <- if (sum(m) > 0L) statistic(data, original, rep(1, sum(m)), ...) else statistic(data, original, ...) rm(original) t0 } else statistic(data, ...) pred.i <- NULL fn <- if (sim == "parametric") { ran.gen data mle function(r) { dd <- ran.gen(data, mle) statistic(dd, ...) } } else { if (!simple && ncol(i) > n) { pred.i <- as.matrix(i[, (n + 1L):ncol(i)]) i <- i[, seq_len(n)] } if (stype %in% c("f", "w")) { f <- freq.array(i) rm(i) if (stype == "w") f <- f/ns if (sum(m) == 0L) function(r) statistic(data, f[r, ], ...) else function(r) statistic(data, f[r, ], pred.i[r, ], ...) } else if (sum(m) > 0L) function(r) statistic(data, i[r, ], pred.i[r, ], ...) else if (simple) function(r) statistic(data, index.array(n, 1, sim, strata, m, L, weights), ...) else function(r) statistic(data, i[r, ], ...) } RR <- sum(R) res <- if (ncpus > 1L && (have_mc || have_snow)) { if (have_mc) { parallel::mclapply(seq_len(RR), fn, mc.cores = ncpus) } else if (have_snow) { list(...) if (is.null(cl)) { cl <- parallel::makePSOCKcluster(rep("localhost", ncpus)) if (RNGkind()[1L] == "L'Ecuyer-CMRG") parallel::clusterSetRNGStream(cl) res <- parallel::parLapply(cl, seq_len(RR), fn) parallel::stopCluster(cl) res } else parallel::parLapply(cl, seq_len(RR), fn) } } else pblapply(seq_len(RR), fn) #### changed !!! t.star <- matrix(, RR, length(t0)) for (r in seq_len(RR)) t.star[r, ] <- res[[r]] if (is.null(weights)) weights <- 1/tabulate(strata)[strata] boot.return(sim, t0, t.star, temp.str, R, data, statistic, stype, call, seed, L, m, pred.i, weights, ran.gen, mle) } ## Functions not exported by boot isMatrix <- boot:::isMatrix index.array <- boot:::index.array boot.return <- boot:::boot.return ## Now the example m.boot <- function(data, indices) { d <- data[indices] mean(d, na.rm = T) } tot_rep <- 200 v1 <- rnorm(1000) b <- boot2(v1, m.boot, R = tot_rep)

Answer 3

我想我找到了一个可能的解决方案。这将@Roland的答案与pbapply包的便利性合并，使用其函数startpb()，closepb()等。

library(boot)
library(pbapply)

v1 <- rnorm(1000)
rep_count = 1
tot_rep = 200

m.boot <- function(data, indices) {
  d <- data[indices]
  setpb(pb, rep_count)
  rep_count <<- rep_count + 1
  Sys.sleep(0.01)                #Just to slow down the process
  mean(d, na.rm = T) 
}

pb <- startpb(min = 0, max = tot_rep)
b <- boot(v1, m.boot, R = tot_rep)
closepb(pb)
rep_count = 1

如前所述，将函数中的所有内容包装起来可以避免弄乱rep_count变量。

Answer 4

包dplyr的进度条效果很好：

library(dplyr)
library(boot)

v1 <- rnorm(1000)

m.boot <- function(data, indices) {
  d <- data[indices]
  p$tick()$print()  # update progress bar
  Sys.sleep(0.01)
  mean(d, na.rm = T) 
}

tot_rep <- 200
p <- progress_estimated(tot_rep+1)  # init progress bar
b <- boot(v1, m.boot, R = tot_rep)

Answer 5

您可以使用包pbapply

library(boot)
library(pbapply)
v1 <- rnorm(1000)
rep_count = 1

# your m.boot function ....
m.boot <- function(data, indices) {
                                   d <- data[indices]
                                   mean(d, na.rm = T) 
                                   }

# ... wraped in `bootfunc`
bootfunc <- function(x) { boot(x, m.boot, R = 200) }

# apply function to v1 , returning progress bar
pblapply(v1, bootfunc)

# > b <- pblapply(v1, bootfunc)
# >   |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% Elapsed time: 02s

Answer 6

您可以按以下方式使用软件包progress：

prior_task = prior_task >> PythonOperator

我还没有弄清楚为什么需要将library(boot) library(progress) v1 <- rnorm(1000) #add progress bar as parameter to function m.boot <- function(data, indices, prog) { #display progress with each run of the function prog$tick() d <- data[indices] Sys.sleep(0.01) mean(d, na.rm = T) } tot_rep <- 200 #initialize progress bar object pb <- progress_bar$new(total = tot_rep + 1) #perform bootstrap boot(data = v1, statistic = m.boot, R = tot_rep, prog = pb)的迭代次数设置为bootstrap复制总数（参数progress_bar）的+1，但这是在我自己的代码，否则会引发错误。引导程序函数运行的时间似乎比参数R中指定的运行时间多，因此，如果将进度条设置为仅运行R次，它会认为作业实际上是在完成之前完成的。

在R中为引导功能添加进度条

6 个答案: