当使用`c()`来提供多个行值时,与执行单个切片相比,切片意外地变慢

时间:2016-06-23 04:45:58

标签: r dplyr

我不明白:为什么用一次 slice 同时取每组的第一个和最后一个值(`slice(c(1, n()))`),会比先用 slice 取第一个值、再用 slice 取最后一个值慢大约 5 倍。

library(dplyr)

# Fix the RNG seed so the sample() / runif() draws below are repeatable
# between runs. The timestamp pool itself still depends on Sys.time().
set.seed(1)

# Value pools the columns are sampled from.
# NOTE: this `letters` masks base R's built-in `letters` constant for the
# rest of the session; the name is kept so existing references keep working.
letters <- c("A", "B", "C")
timestamps <- Sys.time() + 1:1000
integer_sizes <- 1:300

# Number of rows in the benchmark data frame.
N <- 700000

# Random test data, sorted by the three columns used as grouping keys by the
# benchmark functions.
df <-
    data.frame(
        letters = sample(letters, N, replace = TRUE),
        sizes = sample(integer_sizes, N, replace = TRUE),
        time = sample(timestamps, N, replace = TRUE),
        price = runif(N),
        stringsAsFactors = FALSE
    ) %>%
    arrange(letters, sizes, time)

# Benchmark case: keep row 1 of every (letters, sizes, time) group by passing
# the bare scalar index 1 to slice().
#
# data: data frame with `letters`, `sizes` and `time` columns. Defaults to the
#       global `df`, so the zero-argument calls used in the benchmark still
#       work, while a smaller frame can be passed in for quick checks.
# Returns whatever slice() yields for the grouped input.
fn_first <- function(data = df) {
    data %>%
        group_by(letters, sizes, time) %>%
        slice(1)
}

# Benchmark case: keep the last row of every (letters, sizes, time) group,
# using n() (the group size) as the row index.
#
# data: data frame with `letters`, `sizes` and `time` columns. Defaults to the
#       global `df`, so the zero-argument calls used in the benchmark still
#       work, while a smaller frame can be passed in for quick checks.
# Returns whatever slice() yields for the grouped input.
fn_last <- function(data = df) {
    data %>%
        group_by(letters, sizes, time) %>%
        slice(n())
}

# Benchmark case: request the first and the last row of every
# (letters, sizes, time) group in a single slice() call.
#
# The index expression `c(1, n())` is deliberately left exactly as written:
# it is the form whose cost is being measured.
# NOTE(review): for one-row groups the index vector is c(1, 1); confirm
# whether selecting that row twice is intended.
#
# data: data frame with `letters`, `sizes` and `time` columns. Defaults to the
#       global `df`, so the zero-argument calls used in the benchmark still
#       work, while a smaller frame can be passed in for quick checks.
# Returns whatever slice() yields for the grouped input.
fn_combined <- function(data = df) {
    data %>%
        group_by(letters, sizes, time) %>%
        slice(c(1, n()))
}

给出基准测试结果:

> microbenchmark::microbenchmark(fn_first(), fn_last(), fn_combined(), times = 5)
Unit: seconds
          expr      min       lq     mean   median        uq       max neval
    fn_first() 1.038021 1.080131 1.130611 1.092370  1.136293  1.306238     5
     fn_last() 1.198582 1.236393 1.282161 1.303314  1.320716  1.351801     5
 fn_combined() 9.817495 9.916474 9.977188 9.955553 10.090996 10.105425     5

编辑:看起来这与 `c()` 连接函数本身有关——即使 `c()` 里只有一个值,也同样会变慢:

# Benchmark case: same row request as fn_no_c(), but with the index wrapped
# in c(). The `c(1)` spelling must stay exactly as is, because the wrapper is
# the thing being timed.
#
# data: data frame with `letters`, `sizes` and `time` columns. Defaults to the
#       global `df`, so the zero-argument calls used in the benchmark still
#       work, while a smaller frame can be passed in for quick checks.
# Returns whatever slice() yields for the grouped input.
fn_c <- function(data = df) {
    data %>%
        group_by(letters, sizes, time) %>%
        slice(c(1))
}

# Benchmark baseline for fn_c(): the same row request with a bare scalar
# index and no c() wrapper. Keep the literal `1` unwrapped so the comparison
# stays meaningful.
#
# data: data frame with `letters`, `sizes` and `time` columns. Defaults to the
#       global `df`, so the zero-argument calls used in the benchmark still
#       work, while a smaller frame can be passed in for quick checks.
# Returns whatever slice() yields for the grouped input.
fn_no_c <- function(data = df) {
    data %>%
        group_by(letters, sizes, time) %>%
        slice(1)
}

基准测试结果如下:

> microbenchmark(fn_c(), fn_no_c(), times = 5)
Unit: seconds
      expr      min       lq     mean   median       uq      max neval
    fn_c() 8.910264 9.119980 9.222461 9.264068 9.377603 9.440387     5
 fn_no_c() 1.068043 1.075615 1.153856 1.154506 1.211585 1.259534     5

> all.equal(fn_c(), fn_no_c())
[1] TRUE

0 个答案:

没有答案