Question

我正在尝试为学生做一个关于采样的教学示例，但是当迭代次数达到数千时（实际数据帧df有几百万行），结果太慢了。

我可以使用purrr来加快速度吗？

library(tidyverse)
# Reproducible toy data: 100 rows, with v1 drawn from 1..10 with replacement
# and v2 labelling the first 50 rows "A" and the last 50 rows "B".
set.seed(1432)
df <- data.frame(
  v1 = sample(1:10, size = 100, replace = TRUE),
  v2 = rep(c("A", "B"), each = 50)
)

# Accumulator for the sample means; starts as NULL and is grown by c() below.
output <- NULL

# Ten replications, each seeded with its own iteration number.
for (i in 1:10) {
  set.seed(i)
  # Keep only the rows with v2 == "A", then draw 20 of them without
  # replacement.
  d <- df %>%
    filter(v2=="A") %>%
    sample_n(20, replace=FALSE)

  # Mean of v1 in this sample; note that this binds a variable named `mean`.
  mean = mean(d$v1)
  # Append to the result vector; c() builds a new vector on every iteration.
  output <- c(output, mean)
}

# Print the collected means.
output

Answer 1

您可以按以下方式使用purrr。

# One sample mean per seed: seed the RNG, keep the "A" rows, draw 20 of them
# without replacement and average their v1 values.
map_dbl(1:10, function(seed) {
  set.seed(seed)
  group_a <- filter(df, v2 == "A")
  drawn <- sample_n(group_a, 20, replace = FALSE)
  mean(drawn$v1)
})
# [1] 5.15 5.90 5.70 5.55 5.60 4.95 5.40 5.40 5.65 5.40

Answer 2

purrr不一定更快，但比R中的基本控制结构更具可读性。当涉及到替换循环时，您可以在基本R中执行以下操作：

# Base-R counterpart of the loop: sapply() collects one sample mean per seed.
sapply(1:10, function(seed) {
  set.seed(seed)
  rows_a <- df %>% filter(v2 == "A")
  picked <- rows_a %>% sample_n(20, replace = FALSE)
  mean(picked$v1)
})

更新：使用dplyr和purrr并不能保证您的代码运行得更快。依我看（IMO），这些软件包的开发首先是为了提高代码的可读性，而不是为了加速昂贵的计算。如果仔细使用基本的R数据结构，则可以大大提高速度。下面d是原始循环，a和b是函数式编程解决方案，而f是经过优化的解决方案：

# Functional base-R variant: for each seed 1..y, average v1 over 20 "A" rows
# sampled without replacement.
#
# y: number of replications (a non-negative whole number).
# Returns a numeric vector of length y (numeric(0) when y is 0).
# Reads the data frame `df` from the enclosing environment.
a <- function(y) {
  # seq_len() yields an empty sequence for y = 0, whereas 1:0 would run
  # seeds 1 and 0; vapply() pins the result to exactly one double per seed.
  vapply(seq_len(y), function(x) {
    set.seed(x)
    d <- df %>%
      filter(v2 == "A") %>%
      sample_n(20, replace = FALSE)
    mean(d$v1)
  }, numeric(1))
}

# purrr variant: for each seed 1..y, average v1 over 20 "A" rows sampled
# without replacement.
#
# y: number of replications (a non-negative whole number).
# Returns a numeric vector of length y (numeric(0) when y is 0).
# Reads the data frame `df` from the enclosing environment.
b <- function(y) {
  # seq_len() yields an empty sequence for y = 0, whereas 1:0 would run
  # seeds 1 and 0.
  map_dbl(seq_len(y), function(x) {
    set.seed(x)
    d <- df %>%
      filter(v2 == "A") %>%
      sample_n(20, replace = FALSE)

    mean(d$v1)
  })
}

# Baseline loop variant: for each seed 1..y, average v1 over 20 "A" rows
# sampled without replacement.
#
# y: number of replications (a non-negative whole number).
# Returns a numeric vector of length y (numeric(0) when y is 0).
# Reads the data frame `df` from the enclosing environment.
d <- function(y) {
  # Start from an empty numeric vector so the return type does not depend
  # on y.
  output <- numeric(0)
  # seq_len() yields an empty sequence for y = 0, whereas 1:0 would run
  # seeds 1 and 0.
  for (i in seq_len(y)) {
    set.seed(i)
    d <- df %>%
      filter(v2 == "A") %>%
      sample_n(20, replace = FALSE)
    # Growing the vector with c() is kept on purpose: this function is the
    # unoptimized reference the other variants are compared against.
    output <- c(output, mean(d$v1))
  }

  output
}

# Optimized base-R variant: for each seed 1..y, average v1 over 20 "A" rows
# sampled without replacement.
#
# y: number of replications (a non-negative whole number).
# Returns a list of length y holding one mean per seed (an empty list when
# y is 0).
# Reads the data frame `df` from the enclosing environment.
#
# NOTE(review): any timings quoted alongside this function were recorded
# with the "A" subset recomputed inside the loop; re-run the benchmark for
# current numbers.
f <- function(y) {
  # The "A" subset does not depend on the seed, so extract its v1 values
  # once instead of re-filtering the data frame on every iteration.
  v1_a <- df$v1[df$v2 == "A"]
  n_a <- length(v1_a)

  # Preallocate the result instead of growing it inside the loop.
  output <- vector("list", y)
  # seq_len() yields an empty sequence for y = 0, whereas 1:0 would run
  # seeds 1 and 0 and fail on output[[0]].
  for (i in seq_len(y)) {
    set.seed(i)
    # sample.int(n, 20) draws the same indices as sample(1:n, 20).
    idx <- sample.int(n_a, 20, replace = FALSE)
    output[[i]] <- mean(v1_a[idx])
  }

  output
}

# Time 100 replications of each implementation against each other.
microbenchmark::microbenchmark(
  a(100),
  b(100),
  d(100),
  f(100)
)

Unit: milliseconds
   expr       min        lq      mean    median        uq       max neval
 a(100) 172.06305 187.95053 205.19531 199.84411 210.55501 306.41906   100
 b(100) 171.86030 186.18869 206.50518 196.07746 213.79044 397.87859   100
 d(100) 174.45273 191.01706 208.07125 199.12653 216.54543 365.55107   100
 f(100)  14.62159  15.80092  20.96736  19.14848  24.16181  37.54095   100

观察到f的速度几乎是d的10倍，而a，b和d的速度几乎相同。

将此循环转换为purrr？

2 个答案: