更快的最后观察结果(LOCF)

时间:2017-04-11 02:37:04

标签: r stata zoo tidyr

我最近需要通过id在时间上向前和向后分发12个时不变变量的值。我的数据集包含2,448,638个观测值和57个变量。

以下是可重复的讨论示例:

# Load packages
library(tidyverse)
library(zoo)
library(lubridate)
library(tidyr)

# Reproducable example
set.seed(2017)
df <- tibble(
  id       = integer(15),
  days     = integer(15),
  race     = character(15),
  language = character(15)
  ) %>% 

  mutate(
    id = rep(1:3, each = 5)
  ) %>% 

  group_by(id) %>% 

  mutate(
    days     = as.integer(c(rnorm(2, -30, 15), 0, rnorm(2, 200, 100))),
    race     = if_else(days == 0, sample(c("W", "AA", "A", "O"), 1, replace = TRUE), NA_character_),
    language = if_else(days == 0, sample(c("English", "Spanish", "Other"), 1, replace = TRUE), NA_character_)
  ) %>% 

  arrange(id, days)

df

      id  days  race language
   <int> <int> <chr>    <chr>
1      1   -31  <NA>     <NA>
2      1    -8  <NA>     <NA>
3      1     0     W  English
4      1    24  <NA>     <NA>
5      1   273  <NA>     <NA>
6      2   -31  <NA>     <NA>
7      2   -23  <NA>     <NA>
8      2     0     O  English
9      2     4  <NA>     <NA>
10     2   199  <NA>     <NA>
11     3   -33  <NA>     <NA>
12     3    -6  <NA>     <NA>
13     3     0     A  English
14     3   234  <NA>     <NA>
15     3   357  <NA>     <NA>

我想出了几种方法来获得我想要的结果:

使用zoo :: na.locf

time_invariant <- c("race", "language")

df2 <- df %>% 
  group_by(id) %>% 
  mutate_at(.vars = time_invariant, .funs = na.locf, na.rm = FALSE) %>%
  arrange(id, desc(days)) %>%
  mutate_at(.vars = time_invariant, .funs = na.locf, na.rm = FALSE) %>%
  arrange(id, days)

在使用我的2016 MB Pro的可重现示例上完成0.066293秒。

我也试过tidyr :: fill

df2 <- df %>% 
  group_by(id) %>% 
  fill_(fill_cols = time_invariant) %>% 
  fill_(fill_cols = time_invariant, .direction = "up")

在使用我的2016 MB Pro的可重复示例上完成0.04381​​585秒。

然而,在我的真实数据中,zoo :: na.locf方法耗时3.172092分钟,而tidyr :: fill方法耗时5.523152分钟。这些时间并不可怕,但我注意到它们比Stata慢得多(我的2016 MB Pro运行Stata 14.2时为9.9060秒)。这种速度差异促使我看看是否有人知道更快的方法。

1 个答案:

答案 0 :(得分:2)

非常肯定专家可以更快地做到这一点:

df <- structure(list(id = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
    3L, 3L, 3L, 3L, 3L), days = c(-31L, -8L, 0L, 24L, 273L, -31L, 
        -23L, 0L, 4L, 199L, -33L, -6L, 0L, 234L, 357L), race = c(NA, 
            NA, "W", NA, NA, NA, NA, "O", NA, NA, NA, NA, "A", NA, NA), language = c(NA, 
                NA, "English", NA, NA, NA, NA, "English", NA, NA, NA, NA, "English", 
                NA, NA)), class = "data.frame", row.names = c(NA, -15L), .Names = c("id", 
                    "days", "race", "language"))

library(dplyr)
library(zoo)
library(tidyr)
time_invariant <- c("race", "language")
dplyrzoo <- function() {
    df2 <- df %>% 
        group_by(id) %>% 
        mutate_at(.cols = time_invariant, .funs = na.locf, na.rm = FALSE) %>%
        arrange(id, desc(days)) %>%
        mutate_at(.cols = time_invariant, .funs = na.locf, na.rm = FALSE) %>%
        arrange(id, days)
}

dplyrfill <- function() {
    df2 <- df %>% 
        group_by(id) %>% 
        fill_(fill_cols = time_invariant) %>% 
        fill_(fill_cols = time_invariant, .direction = "up")
}

library(data.table)
dtstyle <- function() {
    dt <- data.table(df)
    cols <- c("race", "language")
    dt[, (cols) := lapply(.SD, function(x) na.omit(x)[1]), .SDcols=cols, by =.(id)]
    dt
}

#check results
all.equal(as.data.frame(dplyrzoo()), as.data.frame(dplyrfill()))
all.equal(as.data.frame(dtstyle()), as.data.frame(dplyrfill()))

#timings
library(microbenchmark)
timings <- capture.output(microbenchmark(dplyrzoo=dplyrzoo(),
    dplyrfill=dplyrfill(),
    dtstyle=dtstyle(),
    times=100L))
writeLines(paste("#", timings))

# Unit: milliseconds
#       expr    min      lq     mean  median      uq     max neval
#   dplyrzoo 6.7952 7.01815 7.399851 7.18815 7.53685 10.8360   100
#  dplyrfill 4.7458 5.02865 5.319848 5.16990 5.34750  7.8329   100
#    dtstyle 1.3598 1.54025 1.692119 1.65420 1.73280  4.0413   100