我最近需要通过id在时间上向前和向后分发12个时不变变量的值。我的数据集包含2,448,638个观测值和57个变量。
以下是可重复的讨论示例:
# Load packages
library(tidyverse)
library(zoo)
library(lubridate)
library(tidyr)
# Reproducable example
set.seed(2017)
df <- tibble(
id = integer(15),
days = integer(15),
race = character(15),
language = character(15)
) %>%
mutate(
id = rep(1:3, each = 5)
) %>%
group_by(id) %>%
mutate(
days = as.integer(c(rnorm(2, -30, 15), 0, rnorm(2, 200, 100))),
race = if_else(days == 0, sample(c("W", "AA", "A", "O"), 1, replace = TRUE), NA_character_),
language = if_else(days == 0, sample(c("English", "Spanish", "Other"), 1, replace = TRUE), NA_character_)
) %>%
arrange(id, days)
df
id days race language
<int> <int> <chr> <chr>
1 1 -31 <NA> <NA>
2 1 -8 <NA> <NA>
3 1 0 W English
4 1 24 <NA> <NA>
5 1 273 <NA> <NA>
6 2 -31 <NA> <NA>
7 2 -23 <NA> <NA>
8 2 0 O English
9 2 4 <NA> <NA>
10 2 199 <NA> <NA>
11 3 -33 <NA> <NA>
12 3 -6 <NA> <NA>
13 3 0 A English
14 3 234 <NA> <NA>
15 3 357 <NA> <NA>
我想出了几种方法来获得我想要的结果:
使用zoo :: na.locf
time_invariant <- c("race", "language")
df2 <- df %>%
group_by(id) %>%
mutate_at(.vars = time_invariant, .funs = na.locf, na.rm = FALSE) %>%
arrange(id, desc(days)) %>%
mutate_at(.vars = time_invariant, .funs = na.locf, na.rm = FALSE) %>%
arrange(id, days)
在使用我的2016 MB Pro的可重现示例上完成0.066293秒。
我也试过tidyr :: fill
df2 <- df %>%
group_by(id) %>%
fill_(fill_cols = time_invariant) %>%
fill_(fill_cols = time_invariant, .direction = "up")
在使用我的2016 MB Pro的可重复示例上完成0.04381585秒。
然而,在我的真实数据中,zoo :: na.locf方法耗时3.172092分钟,而tidyr :: fill方法耗时5.523152分钟。这些时间并不可怕,但我注意到它们比Stata慢得多(我的2016 MB Pro运行Stata 14.2时为9.9060秒)。这种速度差异促使我看看是否有人知道更快的方法。
答案 0 :(得分:2)
非常肯定专家可以更快地做到这一点:
df <- structure(list(id = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L), days = c(-31L, -8L, 0L, 24L, 273L, -31L,
-23L, 0L, 4L, 199L, -33L, -6L, 0L, 234L, 357L), race = c(NA,
NA, "W", NA, NA, NA, NA, "O", NA, NA, NA, NA, "A", NA, NA), language = c(NA,
NA, "English", NA, NA, NA, NA, "English", NA, NA, NA, NA, "English",
NA, NA)), class = "data.frame", row.names = c(NA, -15L), .Names = c("id",
"days", "race", "language"))
library(dplyr)
library(zoo)
library(tidyr)
time_invariant <- c("race", "language")
dplyrzoo <- function() {
df2 <- df %>%
group_by(id) %>%
mutate_at(.cols = time_invariant, .funs = na.locf, na.rm = FALSE) %>%
arrange(id, desc(days)) %>%
mutate_at(.cols = time_invariant, .funs = na.locf, na.rm = FALSE) %>%
arrange(id, days)
}
dplyrfill <- function() {
df2 <- df %>%
group_by(id) %>%
fill_(fill_cols = time_invariant) %>%
fill_(fill_cols = time_invariant, .direction = "up")
}
library(data.table)
dtstyle <- function() {
dt <- data.table(df)
cols <- c("race", "language")
dt[, (cols) := lapply(.SD, function(x) na.omit(x)[1]), .SDcols=cols, by =.(id)]
dt
}
#check results
all.equal(as.data.frame(dplyrzoo()), as.data.frame(dplyrfill()))
all.equal(as.data.frame(dtstyle()), as.data.frame(dplyrfill()))
#timings
library(microbenchmark)
timings <- capture.output(microbenchmark(dplyrzoo=dplyrzoo(),
dplyrfill=dplyrfill(),
dtstyle=dtstyle(),
times=100L))
writeLines(paste("#", timings))
# Unit: milliseconds
# expr min lq mean median uq max neval
# dplyrzoo 6.7952 7.01815 7.399851 7.18815 7.53685 10.8360 100
# dplyrfill 4.7458 5.02865 5.319848 5.16990 5.34750 7.8329 100
# dtstyle 1.3598 1.54025 1.692119 1.65420 1.73280 4.0413 100