我不明白为什么用一次 `slice(c(1, n()))` 同时取出每组的第一行和最后一行,
会比先用 `slice(1)` 取第一行、再用 `slice(n())` 取最后一行这样分开执行
慢好几倍(按下面的基准测试,约为两者耗时之和的 4 倍)。
library(dplyr)
# Benchmark fixture: a large data frame with many small groups.
# NOTE: `letters` deliberately keeps its original name (it shadows the base
# constant `letters`) because the column of the same name is used for grouping.
letters <- c("A", "B", "C")
timestamps <- Sys.time() + 1:1000
integer_sizes <- 1:300
N <- 700000

# Every column is drawn at random with replacement, so the number of distinct
# (letters, sizes, time) combinations is large and most groups are tiny.
# The frame is sorted by the grouping keys before benchmarking.
df <-
  data.frame(
    letters = sample(letters, N, replace = TRUE),
    sizes = sample(integer_sizes, N, replace = TRUE),
    time = sample(timestamps, N, replace = TRUE),
    price = runif(N),
    stringsAsFactors = FALSE
  ) %>%
  arrange(letters, sizes, time)
# Benchmark case: keep only the first row of every (letters, sizes, time)
# group of the global `df`. Returns the grouped result.
fn_first <- function() {
  by_key <- group_by(df, letters, sizes, time)
  slice(by_key, 1)
}
# Benchmark case: keep only the last row of every (letters, sizes, time)
# group of the global `df`; `n()` is the size of the current group.
fn_last <- function() {
  by_key <- group_by(df, letters, sizes, time)
  slice(by_key, n())
}
# Benchmark case: keep the first and the last row of every
# (letters, sizes, time) group of the global `df` in a single slice() call.
# The index vector is intentionally built with c(); this is the form whose
# timing is being compared against fn_first() and fn_last().
fn_combined <- function() {
  by_key <- group_by(df, letters, sizes, time)
  slice(by_key, c(1, n()))
}
给出基准测试结果:
> microbenchmark::microbenchmark(fn_first(), fn_last(), fn_combined(), times = 5)
Unit: seconds
expr min lq mean median uq max neval
fn_first() 1.038021 1.080131 1.130611 1.092370 1.136293 1.306238 5
fn_last() 1.198582 1.236393 1.282161 1.303314 1.320716 1.351801 5
fn_combined() 9.817495 9.916474 9.977188 9.955553 10.090996 10.105425 5
编辑:看起来这与是否使用连接函数 `c()` 有关:
# Benchmark case: same selection as fn_no_c() (first row of each group of the
# global `df`), but the index is wrapped in c(). The redundant c(1) is
# intentional: it isolates the cost of passing a c() call to slice().
fn_c <- function() {
  by_key <- group_by(df, letters, sizes, time)
  slice(by_key, c(1))
}
# Benchmark baseline: first row of each (letters, sizes, time) group of the
# global `df`, with the index passed as a bare literal (no c() wrapper).
fn_no_c <- function() {
  by_key <- group_by(df, letters, sizes, time)
  slice(by_key, 1)
}
基准测试结果如下:
> microbenchmark(fn_c(), fn_no_c(), times = 5)
Unit: seconds
expr min lq mean median uq max neval
fn_c() 8.910264 9.119980 9.222461 9.264068 9.377603 9.440387 5
fn_no_c() 1.068043 1.075615 1.153856 1.154506 1.211585 1.259534 5
并且两个函数返回的结果相同:
> all.equal(fn_c(), fn_no_c())
[1] TRUE