我有一个看起来像这样的数据框:
test
id value timestamp
1 foo blue 2019-10-17 17:42:52
2 foo <NA> 2019-10-17 17:43:52
3 foo <NA> 2019-10-17 17:44:52
4 foo red 2019-10-17 17:45:52
5 foo <NA> 2019-10-17 17:46:52
6 bar <NA> 2019-10-17 17:47:52
7 bar green 2019-10-17 17:48:52
8 bar <NA> 2019-10-17 17:49:52
9 bar <NA> 2019-10-17 17:50:52
10 bar <NA> 2019-10-17 17:51:52
我的目标是:当 value 列中出现非 NA 的字符串后,用该值填充其后的 NA 值(按 id 分组),例如:
output
id value timestamp
1 foo blue 2019-10-17 17:42:52
2 foo blue 2019-10-17 17:43:52
3 foo blue 2019-10-17 17:44:52
4 foo red 2019-10-17 17:45:52
5 foo red 2019-10-17 17:46:52
6 bar <NA> 2019-10-17 17:47:52
7 bar green 2019-10-17 17:48:52
8 bar green 2019-10-17 17:49:52
9 bar green 2019-10-17 17:50:52
10 bar green 2019-10-17 17:51:52
我了解如何使用 lead()、lag() 和 rle(),但是在 R 中如何结合时间戳(和 id),用先前已知的值来填充当前的 NA 值?
任何建议将不胜感激
以下是数据:
dput(test)
structure(list(id = c("foo", "foo", "foo", "foo", "foo", "bar",
"bar", "bar", "bar", "bar"), value = c("blue", NA, NA, "red",
NA, NA, "green", NA, NA, NA), timestamp = structure(c(1571348572.31003,
1571348632.31003, 1571348692.31003, 1571348752.31003, 1571348812.31003,
1571348872.31003, 1571348932.31003, 1571348992.31003, 1571349052.31003,
1571349112.31003), class = c("POSIXct", "POSIXt"))), row.names = c(NA,
-10L), class = "data.frame")
答案 0 :(得分:2)
一些选项:
1)使用zoo::na.locf
# Option 1: carry the last observed value forward within each id.
# na.rm = FALSE keeps leading NAs, so each group keeps its length.
setDT(test)[, value := zoo::na.locf(value, na.rm = FALSE), by = id]
2)使用 data.table 1.12.4 版(2019年10月3日在 CRAN 上发布)中新增的 data.table::nafill(目前尚不支持字符向量,因此先向下填充行号,再通过连接取回对应的值)
# Option 2: nafill() cannot fill character vectors, so fill row numbers instead.
# Tag every known value with its row number ...
setDT(test)[!is.na(value), v := .I]
# ... and carry that row number forward within each id.
test[, v := nafill(v, type = "locf"), by = id]
# For the NA rows, look up the value that belongs to the carried row number.
test[
  is.na(value),
  value := test[!is.na(value)][.SD, x.value, on = .(id, v)]
]
3)切成以非NA值开头的组,并分配第一个值(可能很慢)
# Option 3: number the runs inside each id (a run starts at every non-NA
# value) and give every row of a run the run's first value.
# The run counter is computed per id: a `by` expression such as
# cumsum(!is.na(value)) is evaluated over the whole table, so a global counter
# would split runs whenever rows of different ids are interleaved.
setDT(test)[, run_id := cumsum(!is.na(value)), by = id]
test[, value := value[1L], by = .(id, run_id)]
# Drop the helper column again.
test[, run_id := NULL]
4)使用滚动连接(可能最快)
# Option 4: rolling join. The row number is the key that gets rolled.
setDT(test)[, rn := .I]
# Match NA rows to known rows on id exactly; roll = Inf rolls the most recent
# preceding known row (by rn) forward without any distance limit.
test[
  is.na(value),
  value := test[!is.na(value)][.SD, x.value, on = .(id, rn), roll = Inf]
]
数据:
library(data.table) #data.table_1.12.4
# Sample data from the question: a 10-row data.frame with character columns
# `id` and `value` (NA where missing) and a POSIXct `timestamp` column.
test <- structure(list(id = c("foo", "foo", "foo", "foo", "foo", "bar",
"bar", "bar", "bar", "bar"), value = c("blue", NA, NA, "red",
NA, NA, "green", NA, NA, NA), timestamp = structure(c(1571348572.31003,
1571348632.31003, 1571348692.31003, 1571348752.31003, 1571348812.31003,
1571348872.31003, 1571348932.31003, 1571348992.31003, 1571349052.31003,
1571349112.31003), class = c("POSIXct", "POSIXt"))), row.names = c(NA,
-10L), class = "data.frame")
计时代码:
library(data.table)
# Reproducible benchmark input: `nr` rows whose ids are drawn at random from
# 1..nid, and whose value is either "A" or NA.
set.seed(0L)
nr <- 1e7
nid <- 1e5
DT <- data.table(
  id = sample(nid, size = nr, replace = TRUE),
  value = sample(c("A", NA_character_), size = nr, replace = TRUE)
)
# Every method updates its argument by reference, so each gets its own copy.
DT1 <- copy(DT)
DT2 <- copy(DT)
DT3 <- copy(DT)
DT4 <- copy(DT)
# Method 1: last observation carried forward per id via zoo::na.locf().
# `test` is a data.table with columns `id` and `value`; it is updated by
# reference. na.rm = FALSE keeps leading NAs so group lengths are unchanged.
mtd1 <- function(test) {
  test[, value := zoo::na.locf(value, na.rm = FALSE), by = id]
}
# Method 2: fill row numbers with nafill(), then join the values back.
# `test` is a data.table with columns `id` and `value`; it is updated by
# reference and gains a helper column `v`.
mtd2 <- function(test) {
  # Row number of every known value ...
  test[!is.na(value), v := .I]
  # ... carried forward within each id.
  test[, v := nafill(v, type = "locf"), by = id]
  # v holds table-wide row numbers, so joining on v alone identifies the row.
  test[
    is.na(value),
    value := test[!is.na(value)][.SD, x.value, on = .(v)]
  ]
}
# Method 3: split each id into runs that start at a non-NA value and assign
# the run's first value to every row of the run.
# `test` is a data.table with columns `id` and `value`; it is updated by
# reference and returned invisibly.
mtd3 <- function(test) {
  # The run counter must be computed per id. A `by` expression like
  # cumsum(!is.na(value)) is evaluated over the whole table, so with
  # interleaved ids (as in the benchmark data) a global counter would put an
  # NA row into a different group than the value that precedes it in its id.
  test[, run_id := cumsum(!is.na(value)), by = id]
  test[, value := value[1L], by = .(id, run_id)]
  # Remove the helper column so the table keeps its original columns.
  test[, run_id := NULL]
}
# Method 4: rolling join on (id, row number).
# `test` is a data.table with columns `id` and `value`; it is updated by
# reference and gains a helper column `rn`.
mtd4 <- function(test) {
  test[, rn := .I]
  # id must match exactly; roll = Inf rolls the latest preceding known row
  # (by rn) forward onto each NA row.
  test[
    is.na(value),
    value := test[!is.na(value)][.SD, x.value, on = .(id, rn), roll = Inf]
  ]
}
# NOTE: each method modifies its table in place, so only the first of the
# three iterations runs on the original NA pattern; later iterations see data
# that has already been filled.
microbenchmark::microbenchmark(
  mtd1(DT1),
  mtd2(DT2),
  mtd3(DT3),
  mtd4(DT4),
  times = 3L
)
时间:
Unit: milliseconds
expr min lq mean median uq max neval
mtd1(DT1) 3800.3253 3840.028 3964.1050 3879.7299 4045.9948 4212.260 3
mtd2(DT2) 3795.3079 3921.329 4352.8009 4047.3509 4631.5474 5215.744 3
mtd3(DT3) 3188.2339 4236.435 4674.1033 5284.6361 5417.0380 5549.440 3
mtd4(DT4) 340.7171 481.754 734.3773 622.7909 931.2074 1239.624 3
答案 1 :(得分:1)
tidyr::fill 正是您需要的函数。
这是我的例子:
library(dplyr); library(tidyr)
# Fill NA values downwards within each id using tidyr::fill().
test %>%
  # arrange(timestamp) %>%  # only needed if rows are not already in time order
  group_by(id) %>%
  # The argument is `.direction` (with a leading dot); a plain `direction =`
  # would be captured by `...` and treated as a column selection.
  fill(value, .direction = "down") %>%  # "down" is the default
  ungroup()