Question

让我们考虑下面这个按天编号的时间序列：

# Simulate 15 measurements, 5 per object, taken on randomly drawn days 1-9
# (a day can repeat or be absent for a given object).
test <- data.table(
  day = sample(1:9, 15, TRUE),
  name = rep(c("a", "b", "c"), each = 5),
  value = sample(1:3, 15, TRUE)
)
# Show the first measurement per (name, day), newest day first within each name.
unique(test, by = c("name", "day"))[order(name, -day)]
    day name value
 1:   7    a     3
 2:   4    a     2
 3:   2    a     2
 4:   1    a     2
 5:   9    b     1
 6:   8    b     3
 7:   6    b     3
 8:   5    b     2
 9:   3    b     3
10:   7    c     1
11:   6    c     1
12:   4    c     1
13:   3    c     3
14:   1    c     2

如您所见，我们在9天内对三个对象a、b和c进行了一些测量。我们希望对这三个对象逐日比较value，遗憾的是有些日期随机缺失，这使得原本很简单的算法无法直接运行。

我想将行注入此数据表中，以便所有对象具有相同的日期。注入的行会将value默认为0

所有对象的所有可用日期都列在：

> sort(unique(test[,day]) )
[1] 1 2 3 4 5 6 7 8 9

因此，例如，对象a缺少几天：3, 5, 6, 8, 9

在行注入之后，a的数据表看起来像：

# Desired rows for object "a" after the missing days have been injected with value 0.
test[name=="a"]
   day name value
1:   1    a     2
2:   2    a     1
3:   3    a     0
4:   4    a     3
5:   5    a     0
6:   6    a     0
7:   7    a     3
8:   8    a     0
9:   9    a     0

关于如何解决这个问题，有什么想法吗？也许某些库（如lubridate）已经提供了这样的功能。

Answer 1

使用您发布的数据，我将其复制并放入data.table，您可以使用以下方式执行此操作：

library(data.table)
## create a table with every (day, name) combination present in the data
all.dates <- setDT(expand.grid(day = sort(unique(test[, day])), name = sort(unique(test[, name]))))
## perform a left-outer-join of all.dates with test: both tables are keyed on
## (day, name), so test[all.dates] keeps every row of all.dates
setkey(all.dates)
setkey(test, day, name)
test <- test[all.dates]
## combinations without a measurement come back with value = NA; set only
## those to zero, updating the column by reference rather than building an
## NA mask over every column and reassigning the whole table
test[is.na(value), value := 0L]
##   day name value
##1    1    a     2
##2    1    b     0
##3    1    c     2
##4    2    a     2
##5    2    b     0
##6    2    c     0
##7    3    a     0
##8    3    b     3
##9    3    c     3
##10   4    a     2
##11   4    b     0
##12   4    c     1
##13   5    a     0
##14   5    b     2
##15   5    c     0
##16   6    a     0
##17   6    b     3
##18   6    c     1
##19   7    a     3
##20   7    b     0
##21   7    c     1
##22   8    a     0
##23   8    b     3
##24   8    c     0
##25   9    a     0
##26   9    b     1
##27   9    c     0

数据：

# Sample data (dput() of the question's table, duplicates already removed).
# The `.internal.selfref = <pointer: ...>` entry that dput() emits for a
# data.table is not valid R syntax and has been dropped so this can be pasted
# and run. NOTE(review): if data.table later warns about an invalid
# self-reference, call setDT(test) once after creating the object.
test <- structure(
  list(
    day = c(7L, 4L, 2L, 1L, 9L, 8L, 6L, 5L, 3L, 7L, 6L, 4L, 3L, 1L),
    name = structure(
      c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L),
      .Label = c("a", "b", "c"),
      class = "factor"
    ),
    value = c(3L, 2L, 2L, 2L, 1L, 3L, 3L, 2L, 3L, 1L, 1L, 1L, 3L, 2L)
  ),
  .Names = c("day", "name", "value"),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -14L)
)
##     day name value
##  1:   7    a     3
##  2:   4    a     2
##  3:   2    a     2
##  4:   1    a     2
##  5:   9    b     1
##  6:   8    b     3
##  7:   6    b     3
##  8:   5    b     2
##  9:   3    b     3
## 10:   7    c     1
## 11:   6    c     1
## 12:   4    c     1
## 13:   3    c     3
## 14:   1    c     2

Answer 2

在tidyverse中，其中一个软件包（tidyr）提供了对expand.grid和left_join的封装。

library(tidyverse)
# Turn day and name into factors whose levels list every expected day and
# object, then let complete() add the combinations that are absent from the
# data, filling their value with 0.
test$day <- factor(test$day, levels = 1:9)
test$name <- factor(test$name, levels = c("a", "b", "c"))
complete(test, day, name, fill = list(value = 0))
#> # A tibble: 32 × 3
#>       day   name value
#>    <fctr> <fctr> <dbl>
#> 1       1      a     0
#> 2       1      b     0
#> 3       1      c     0
#> 4       2      a     0
#> 5       2      b     0
#> 6       2      c     1
#> 7       3      a     1
#> 8       3      b     0
#> 9       3      c     0
#> 10      4      a     3
#> # ... with 22 more rows

您也可以使用expand.grid和左连接。

# Every combination of the day levels and the names that occur in the data
# (expand.grid names the columns Var1 and Var2).
possibilities <- expand.grid(levels(test$day), unique(test$name))

# Left-join the measurements onto the full grid; combinations with no match
# get value = NA, which is then replaced by 0.
mutate(
  left_join(possibilities, test, by = c("Var1" = "day", "Var2" = "name")),
  value = ifelse(is.na(value), 0, value)
)
#>    Var1 Var2 value
#> 1     1    a     0
#> 2     2    a     0
#> 3     3    a     1
#> 4     4    a     3
#> 5     5    a     1

如何修复多个观察中缺少日期的时间序列？

2 个答案: