Question

我想用 dplyr（而不是 `apply(df, 1, ...)`）按逻辑表达式逐行过滤数据集：在下面的示例中，我想删除所有包含一个或多个值 99 的行。但是，dplyr 版本的性能之差让我感到意外。有没有办法让 dplyr 的写法更快？另外，我原以为 `rowwise()` 会把每一行逐个传入管道，但显然并非如此（见下文）。`rowwise()` 应该怎样使用？

library(tidyverse)
# Benchmark fixture: one row per problem size (250, 500, ..., 5000 rows).
# The real dataset has about 400K rows.
s <- tibble(rows = seq(from = 250, to = 5000, by = 250))
# Six columns' worth of rnorm() draws for every problem size.
s$num <- map(s$rows, ~ rnorm(.x * 6))
# Overwrite a random 1/20 of the values with the sentinel 99.
# seq_along() replaces 1:length(), which would yield c(1, 0) for empty input.
s$num <- map(
  s$num,
  ~ replace(.x, sample(seq_along(.x), size = length(.x) / 20), 99)
)
# Reshape each vector into a six-column data frame.
s$mat <- map(s$num, ~ as_data_frame(matrix(.x, ncol = 6)))

# Row predicate: is `vec` free of the sentinel value 99?
#
# The leftover `browser()` debugging call was removed: it stopped every call
# in the interactive debugger, which makes the function unusable inside the
# timed benchmarks.
#
# Args:
#   vec: vector of values to check.
# Returns:
#   TRUE when no element equals 99, FALSE when at least one does. As with any
#   `==` comparison, NA elements can make the result NA when no 99 is present.
help_an <- function(vec) {
  !any(vec == 99)
}

# Times the transpose-based dplyr approach to dropping rows that contain 99.
#
# `df` is transposed so that every original row becomes a column, help_an()
# is applied to each of those columns, and the resulting flags (transposed
# back into a column) are used to filter `df`.
#
# Args:
#   df: data frame to filter.
# Returns:
#   A one-row tibble: `time` (element 3 of the proc.time() difference, i.e.
#   elapsed seconds) and `df` (list column holding the filtered data frame).
help_dp_t <- function(df) {
  started <- proc.time()
  row_flags <- summarise_all(as_data_frame(t(df)), help_an)
  kept <- filter(df, t(row_flags)[, 1])
  elapsed <- (proc.time() - started)[3]
  tibble(time = elapsed, df = list(kept))
}

# Run the transpose-based dplyr benchmark on every test data frame.
s$dplyr <- map(s$mat, function(m) dplyr::mutate(help_dp_t(m)))

# Times the base-R apply() approach to dropping rows that contain 99.
#
# apply() with MARGIN = 1 evaluates the predicate once per row; the resulting
# logical vector is used to subset `df`.
#
# Args:
#   df: data frame to filter.
# Returns:
#   A one-row tibble: `time` (element 3 of the proc.time() difference, i.e.
#   elapsed seconds) and `df` (list column holding the filtered data frame).
help_lap <- function(df) {
  started <- proc.time()
  keep_row <- apply(df, 1, function(row_vals) !any(row_vals == 99))
  kept <- df[keep_row, ]
  elapsed <- (proc.time() - started)[3]
  tibble(time = elapsed, df = list(kept))
}

# Run the apply()-based benchmark on every test data frame.
s$lapply <- map(s$mat, ~ mutate(help_lap(.x)))
# all.equal() returns a character vector describing the differences (not
# FALSE) when its arguments differ, which map2_lgl() cannot coerce to a
# logical; isTRUE() turns that case into a plain FALSE.
s$equal_dplyr_lapply <-
  map2_lgl(s$dplyr, s$lapply, ~ isTRUE(all.equal(.x$df, .y$df)))

# Pull the elapsed time out of each benchmark result.
s$dplyr_time <- map_dbl(s$dplyr, "time")
s$lapply_time <- map_dbl(s$lapply, "time")

# Reshape the two timing columns to long form (key/value) and plot the timing
# against the number of rows. The columns are named explicitly instead of
# being picked by position (7, 8), so the plot keeps working when columns are
# added to or reordered in `s`.
ggplot(
  gather(s, key = "key", value = "value", dplyr_time, lapply_time),
  aes(x = rows, y = value, color = key)
) +
  geom_line()

我用 `rowwise()` 尝试了以下写法，但管道传给 `help_an` 函数的并不是单行的向量，而是整个数据框。

# Per-row benchmark: adds a logical column `cond` to `df` that is TRUE for
# rows containing no 99.
#
# Inside a magrittr pipe `.` always stands for the whole left-hand side, so
# `df %>% rowwise() %>% mutate(cond = help_an(.))` hands help_an() the entire
# data frame and produces one identical value for every row. pmap_lgl() walks
# the rows explicitly and passes each one to help_an() as a vector instead.
#
# Args:
#   df: data frame whose rows are checked.
# Returns:
#   A one-row tibble: `time` (element 3 of the proc.time() difference, i.e.
#   elapsed seconds) and `df` (list column holding `df` plus `cond`).
help_dp_r <- function(df) {
  clo1 <- proc.time()
  # c(...) gathers the values of one row into a single vector.
  row_is_clean <- function(...) help_an(c(...))
  df2 <- mutate(df, cond = pmap_lgl(df, row_is_clean))
  # Returned visibly; the original ended on an assignment (invisible result).
  tibble(time = (proc.time() - clo1)[3], df = list(df2))
}

    # Run the per-row benchmark on every test data frame.
    s$dplyr_r <- map(s$mat, function(m) dplyr::mutate(help_dp_r(m)))

dplyr中的逐行过滤

0 个答案: