让我们看一下下面这个 dplyr 风格的示例。
# 1. Example data: two grouping keys (g1, g2), a status flag and the
#    date on which each observation was recorded.
df <- data.table(
  g1 = c(1, 1, 2, 1, 2, 2, 1),
  g2 = c(2, 1, 3, 3, 1, 1, 2),
  status = c(1, 0, 1, 0, 0, 1, 1),
  date_obs = as.Date(c(
    "2019-01-01", "2019-01-02", "2019-01-12", "2019-01-15",
    "2019-01-20", "2019-01-24", "2019-01-30"
  ))
)

# 2. Sort by group, then chronologically within each group.
df <- arrange(df, g1, g2, date_obs)

# 3. Within every (g1, g2) group, add one row for each missing day between
#    the group's earliest and latest observation, then carry the last known
#    status forward into the newly created rows.
df_filled <- df %>%
  group_by(g1, g2) %>%
  complete(
    date_obs = seq(from = min(date_obs), to = max(date_obs), by = "day")
  ) %>%
  fill(status, .direction = "down") %>%
  arrange(g1, g2, date_obs) %>%
  ungroup()
如何使用 R 的 data.table 语法完成相同的操作?
谢谢!
答案 0 :(得分:3)
另一种做法是使用滚动联接(rolling join)。
# Key (and therefore sort) DT by group and date, so the first and last row
# of each (g1, g2) group hold its earliest and latest observation date.
setkey(DT, g1, g2, date_obs)

# Inner DT[...]: one row per calendar day between each group's first and
# last observation.
# Outer DT[...]: rolling join on that daily grid; for a day without an exact
# match, roll = TRUE uses the most recent earlier row of the same group.
out <- DT[
  DT[
    ,
    .(date_obs = seq(date_obs[1L], date_obs[.N], by = "day")),
    by = .(g1, g2)
  ],
  on = .(g1, g2, date_obs),
  roll = TRUE
]
out
# g1 g2 status date_obs
# 1: 1 1 0 2019-01-02
# 2: 1 2 1 2019-01-01
# 3: 1 2 1 2019-01-02
# 4: 1 2 1 2019-01-03
# 5: 1 2 1 2019-01-04
# 6: 1 2 1 2019-01-05
# 7: 1 2 1 2019-01-06
# 8: 1 2 1 2019-01-07
# 9: 1 2 1 2019-01-08
#10: 1 2 1 2019-01-09
#11: 1 2 1 2019-01-10
#12: 1 2 1 2019-01-11
#13: 1 2 1 2019-01-12
#14: 1 2 1 2019-01-13
#15: 1 2 1 2019-01-14
#16: 1 2 1 2019-01-15
#17: 1 2 1 2019-01-16
#18: 1 2 1 2019-01-17
#19: 1 2 1 2019-01-18
#20: 1 2 1 2019-01-19
#21: 1 2 1 2019-01-20
#22: 1 2 1 2019-01-21
#23: 1 2 1 2019-01-22
#24: 1 2 1 2019-01-23
#25: 1 2 1 2019-01-24
#26: 1 2 1 2019-01-25
#27: 1 2 1 2019-01-26
#28: 1 2 1 2019-01-27
#29: 1 2 1 2019-01-28
#30: 1 2 1 2019-01-29
#31: 1 2 1 2019-01-30
#32: 1 3 0 2019-01-15
#33: 2 1 0 2019-01-20
#34: 2 1 0 2019-01-21
#35: 2 1 0 2019-01-22
#36: 2 1 0 2019-01-23
#37: 2 1 1 2019-01-24
#38: 2 3 1 2019-01-12
# g1 g2 status date_obs
数据
# Sample data for the rolling-join example: grouping keys g1 and g2, a status
# flag, and observation dates, all of which fall in January 2019.
DT <- data.table(
  g1 = c(1, 1, 2, 1, 2, 2, 1),
  g2 = c(2, 1, 3, 3, 1, 1, 2),
  status = c(1, 0, 1, 0, 0, 1, 1),
  date_obs = as.Date(
    sprintf("2019-01-%02d", c(1L, 2L, 12L, 15L, 20L, 24L, 30L))
  )
)
答案 1 :(得分:2)
该方法先按变量 g1 和 g2 分组,生成覆盖各组完整日期范围的“完整”数据表;然后与原始 data.table 做左连接,最后填充值为 NA 的 status。
这种做法仍然需要依赖另一个程序包来完成填充(fill)这一步。我没能让 tidyr::fill 正常工作,但 zoo::na.locf 的效果非常好。填充 NA 值还有很多其他方法:
Replace NA with last non-NA in data.table by using only data.table
R data.table join/ subsetting/ match by group and by a condition
Replacing NAs with latest non-NA value
# Sort in place so every (g1, g2) group is in chronological order.
setorder(dt, g1, g2, date_obs)

# Daily grid: one row per calendar day between each group's earliest and
# latest observation.
dt_complete <- dt[
  ,
  .(date_obs = seq(min(date_obs), max(date_obs), by = "day")),
  by = .(g1, g2)
]

# Left-join the observations onto the daily grid (days without an observation
# get status = NA), then carry the last observed status forward.
# The fill runs per (g1, g2) group so a value can never be carried from one
# group into the next, and na.rm = FALSE keeps a leading NA in place rather
# than dropping it, so the filled vector always has one value per row.
dt[dt_complete, on = c("date_obs", "g1", "g2")][
  ,
  .(date_obs, status = zoo::na.locf(status, na.rm = FALSE)),
  by = .(g1, g2)
]
g1 g2 date_obs status
1: 1 1 2019-01-02 0
2: 1 2 2019-01-01 1
3: 1 2 2019-01-02 1
4: 1 2 2019-01-03 1
5: 1 2 2019-01-04 1
6: 1 2 2019-01-05 1
7: 1 2 2019-01-06 1
8: 1 2 2019-01-07 1
9: 1 2 2019-01-08 1
... 38 total rows...
数据(与提问者 OP 给出的数据相同):
library(data.table)
library(tidyverse)
# 1. Sample data: grouping keys g1 and g2, a status flag, and the observation
#    date of each record (ISO "YYYY-MM-DD" strings converted to Date).
dt <- data.table(
  g1 = c(1, 1, 2, 1, 2, 2, 1),
  g2 = c(2, 1, 3, 3, 1, 1, 2),
  status = c(1, 0, 1, 0, 0, 1, 1),
  date_obs = as.Date(
    c(
      "2019-01-01", "2019-01-02", "2019-01-12", "2019-01-15",
      "2019-01-20", "2019-01-24", "2019-01-30"
    ),
    format = "%Y-%m-%d"
  )
)