R中的后/前填充行

时间:2019-10-18 01:37:23

标签: r dplyr data.table

我有一个看起来像这样的数据框:

 test
    id value           timestamp
1  foo  blue 2019-10-17 17:42:52
2  foo  <NA> 2019-10-17 17:43:52
3  foo  <NA> 2019-10-17 17:44:52
4  foo   red 2019-10-17 17:45:52
5  foo  <NA> 2019-10-17 17:46:52
6  bar  <NA> 2019-10-17 17:47:52
7  bar green 2019-10-17 17:48:52
8  bar  <NA> 2019-10-17 17:49:52
9  bar  <NA> 2019-10-17 17:50:52
10 bar  <NA> 2019-10-17 17:51:52

我的目标是在value中出现非空字符串后填写NA值,例如:

 output
    id value           timestamp
1  foo  blue 2019-10-17 17:42:52
2  foo  blue 2019-10-17 17:43:52
3  foo  blue 2019-10-17 17:44:52
4  foo   red 2019-10-17 17:45:52
5  foo   red 2019-10-17 17:46:52
6  bar  <NA> 2019-10-17 17:47:52
7  bar green 2019-10-17 17:48:52
8  bar green 2019-10-17 17:49:52
9  bar green 2019-10-17 17:50:52
10 bar green 2019-10-17 17:51:52

我了解如何使用 lead()、lag() 和 rle(),但是如何在 R 中按时间戳(和 ID)的顺序,用先前已知的值来填充当前的缺失值?

任何建议将不胜感激

以下是数据:

 dput(test)
structure(list(id = c("foo", "foo", "foo", "foo", "foo", "bar", 
"bar", "bar", "bar", "bar"), value = c("blue", NA, NA, "red", 
NA, NA, "green", NA, NA, NA), timestamp = structure(c(1571348572.31003, 
1571348632.31003, 1571348692.31003, 1571348752.31003, 1571348812.31003, 
1571348872.31003, 1571348932.31003, 1571348992.31003, 1571349052.31003, 
1571349112.31003), class = c("POSIXct", "POSIXt"))), row.names = c(NA, 
-10L), class = "data.frame")

2 个答案:

答案 0 :(得分:2)

一些选项:

1)使用zoo::na.locf

# Convert to data.table by reference, then carry the last observed value
# forward within each id; na.rm = FALSE keeps leading NAs in place.
setDT(test)
test[, value := zoo::na.locf(value, na.rm = FALSE), by = id]

2)使用 data.table 1.12.4(2019年10月3日发布于 CRAN)中新增的 data.table::nafill 函数(不过它目前还不支持字符向量,因此下面先对行号做填充,再通过连接取回对应的值)

setDT(test)
# Store a row index for every observed value, then carry that index forward
# within each id (nafill handles the numeric index, not the character column).
test[!is.na(value), v := .I]
test[, v := nafill(v, type = "locf"), by = id]
# For each NA row, fetch the value of the observed row sharing its (id, v).
test[is.na(value),
     value := test[!is.na(value)][.SD, on = .(id, v), x.value]]

3)切成以非NA值开头的组,并分配第一个值(可能很慢)

setDT(test)
# Number the runs *within* each id. An expression placed directly in `by` is
# evaluated over the whole table, so a global cumsum would split a run whenever
# a row of another id carries a value in between.
test[, grp := cumsum(!is.na(value)), by = id]
# Every run starts at a non-NA value (or is the leading all-NA run), so its
# first element is the value to propagate.
test[, value := value[1L], by = .(id, grp)]
# Drop the temporary run counter.
test[, grp := NULL]

4)使用滚动连接(可能最快)

setDT(test)
# Row number acts as the ordered key for the rolling join.
test[, rn := .I]
# For each NA row, roll the most recent observed value of the same id forward
# (roll = Inf places no limit on how far back the match may be).
test[is.na(value),
     value := test[!is.na(value)][.SD, on = .(id, rn), roll = Inf, x.value]]

数据:

library(data.table) #data.table_1.12.4
test <- structure(list(id = c("foo", "foo", "foo", "foo", "foo", "bar", 
        "bar", "bar", "bar", "bar"), value = c("blue", NA, NA, "red", 
            NA, NA, "green", NA, NA, NA), timestamp = structure(c(1571348572.31003, 
                1571348632.31003, 1571348692.31003, 1571348752.31003, 1571348812.31003, 
                1571348872.31003, 1571348932.31003, 1571348992.31003, 1571349052.31003, 
                1571349112.31003), class = c("POSIXct", "POSIXt"))), row.names = c(NA, 
                    -10L), class = "data.frame")

计时代码:

library(data.table)
# Fix the RNG seed so the benchmark data is reproducible.
set.seed(0L)
# nr rows are drawn from nid distinct ids.
nr <- 1e7
nid <- 1e5
# Ids are sampled at random, so rows of different ids are interleaved;
# value is either "A" or NA.
DT <- data.table(id=sample(nid, nr, TRUE), value=sample(c("A", NA_character_), nr, TRUE))
# One copy per method: the mtd* functions below update their argument with
# `:=`, so each needs its own table.
DT1 <- copy(DT)
DT2 <- copy(DT)
DT3 <- copy(DT)
DT4 <- copy(DT)

# Method 1: per-id last-observation-carried-forward using zoo.
# Updates `test` by reference.
mtd1 <- function(test) {
  # na.rm = FALSE keeps leading NAs so the column length is unchanged.
  test[, value := zoo::na.locf(value, na.rm = FALSE), by = id]
}

# Method 2: nafill() on a numeric row index, then a join to fetch the values.
# Updates `test` by reference and leaves the helper column `v` in place.
mtd2 <- function(test) {
  # Record a row index for each observed value and carry it forward per id.
  test[!is.na(value), v := .I]
  test[, v := nafill(v, type = "locf"), by = id]
  # Look up, for every NA row, the observed row holding the same index.
  observed <- test[!is.na(value)]
  test[is.na(value), value := observed[.SD, on = .(v), x.value]]
}

# Method 3: split each id into runs that start at a non-NA value and assign
# the first value of the run to the whole run.
# Updates `test` by reference.
mtd3 <- function(test) {
  # The run counter must be computed per id. An expression placed directly in
  # `by` is evaluated over the whole table, so with interleaved ids (as in the
  # benchmark data) a global cumsum would break a run whenever another id
  # carries a value in between, leaving NAs that should have been filled.
  test[, grp := cumsum(!is.na(value)), by = id]
  test[, value := value[1L], by = .(id, grp)]
  # Remove the temporary run counter.
  test[, grp := NULL]
}

# Method 4: rolling join of the NA rows onto the observed rows.
# Updates `test` by reference and leaves the helper column `rn` in place.
mtd4 <- function(test) {
  # Row number acts as the ordered key to roll along.
  test[, rn := .I]
  known <- test[!is.na(value)]
  # roll = Inf: take the latest observed row of the same id, however far back.
  test[is.na(value), value := known[.SD, on = .(id, rn), roll = Inf, x.value]]
}

# Time each method three times, each on its own copy of the data.
# NOTE(review): the mtd* functions update their argument by reference with
# `:=`, so iterations after the first appear to run on an already-filled
# table - confirm whether only the first run should be reported.
microbenchmark::microbenchmark(mtd1(DT1), mtd2(DT2), mtd3(DT3), mtd4(DT4), times=3L)

时间:

Unit: milliseconds
      expr       min       lq      mean    median        uq      max neval
 mtd1(DT1) 3800.3253 3840.028 3964.1050 3879.7299 4045.9948 4212.260     3
 mtd2(DT2) 3795.3079 3921.329 4352.8009 4047.3509 4631.5474 5215.744     3
 mtd3(DT3) 3188.2339 4236.435 4674.1033 5284.6361 5417.0380 5549.440     3
 mtd4(DT4)  340.7171  481.754  734.3773  622.7909  931.2074 1239.624     3

答案 1 :(得分:1)

tidyr::fill是您想要的。

这是我的例子:

library(dplyr)
library(tidyr)

test %>%
  # arrange(timestamp) %>%   # not needed here: the example rows are already in time order
  group_by(id) %>%
  # The argument is `.direction` (leading dot); without the dot it is captured
  # by `...` as a column selection. "down" is also the default.
  fill(value, .direction = "down") %>%
  ungroup()