特定组变量跟踪满足条件时的最后观察

时间:2019-10-11 05:43:50

标签: r data.table

我正在尝试构造一个特定于组的变量,

  1. 不适用,直到满足条件为止。
  2. 满足条件时进行第一次观察的不适用。
  3. 对于后续观察,是指满足条件时最近的先前观察的值。

以下是MWE。我在寻找构造“ last_value”的代码。我更喜欢使用data.table(或dplyr)来实现可伸缩性/性能,但愿意接受其他建议。谢谢!

library(data.table)

data.table(
    group = c(rep("alpha", 4), rep("beta", 3)),
    time = c(1:4, 1:3),
    condition = c("Yes", "No", "Yes", "No", "No", "Yes", "No"),
    value = 1:7,
    last_value = c(NA, 1, 1, 3, NA, NA, 6))

#    group time condition value last_value
# 1: alpha    1       Yes     1         NA
# 2: alpha    2        No     2          1
# 3: alpha    3       Yes     3          1
# 4: alpha    4        No     4          3
# 5:  beta    1        No     5         NA
# 6:  beta    2       Yes     6         NA
# 7:  beta    3        No     7          6

# last_value is:

#     NA in the 1st row as that is the 1st observation for group "alpha"
#     1 in the 2nd row as the 1st observation is condition = "Yes"
#     1 in the 3rd row as the 1st observation is condition = "Yes"; 2nd observation is condition = "No"
#     3 in the 4rd row as the 3rd observation is condition = "Yes"

#     NA in the 5th row as that is the 1st observation for group "beta"
#     NA in the 6th row as there is no prior observation with condition = "Yes"
#     6 in the 7th row as the 6th observation is condition = "Yes"

4 个答案:

答案 0 :(得分:2)

这是dplyr中使用cummax的另一个。

library(dplyr)
df %>%
  group_by(group) %>%
  mutate(last = cummax(row_number() * (condition == "Yes")),
         last = lag(value[replace(last, last == 0, NA)]))


#  group  time condition value last_value  last
#  <fct> <int> <fct>     <int>      <dbl> <int>
#1 alpha     1 Yes           1         NA    NA
#2 alpha     2 No            2          1     1
#3 alpha     3 Yes           3          1     1
#4 alpha     4 No            4          3     3
#5 beta      1 No            5         NA    NA
#6 beta      2 Yes           6         NA    NA
#7 beta      3 No            7          6     6

答案 1 :(得分:2)

借此机会来体验data.tablefifelse()的一些最新添加:

nafill()

或使用管道:

DT[order(time), # necessary if data is note in order
   last_value2 := nafill(shift(fifelse(condition == "Yes", value, NA_integer_)), "locf"),
   by = group
   ]

   group time condition value last_value last_value2
1: alpha    1       Yes     1         NA          NA
2: alpha    2        No     2          1           1
3: alpha    3       Yes     3          1           1
4: alpha    4        No     4          3           3
5:  beta    1        No     5         NA          NA
6:  beta    2       Yes     6         NA          NA
7:  beta    3        No     7          6           6

答案 2 :(得分:1)

这是一种dplyr方法,其中last_value中的期望输出是在last_value2中如实生成的。

library(dplyr)
library(tidyr)
df %>%
  group_by(group) %>%
  mutate(value2 = if_else(condition == "Yes", value, NA_integer_)) %>%
  tidyr::fill(value2) %>%
  mutate(last_value2 = lag(value2)) %>%
  ungroup()

## A tibble: 7 x 7
#  group  time condition value last_value value2 last_value2
#  <fct> <int> <fct>     <int>      <dbl>  <int>       <int>
#1 alpha     1 Yes           1         NA      1          NA
#2 alpha     2 No            2          1      1           1
#3 alpha     3 Yes           3          1      3           1
#4 alpha     4 No            4          3      3           3
#5 beta      1 No            5         NA     NA          NA
#6 beta      2 Yes           6         NA      6          NA
#7 beta      3 No            7          6      6           6

假设此处加载了数据:

df <- data.frame(
  group = c(rep("alpha", 4), rep("beta", 3)),
  time = c(1:4, 1:3),
  condition = c("Yes", "No", "Yes", "No", "No", "Yes", "No"),
  value = 1:7,
  last_value = c(NA, 1, 1, 3, NA, NA, 6))

答案 3 :(得分:1)

以下是使用data.table的一些选项:

1。使用非等额联接

DT[, lv := 
    DT[condition=="Yes"][.SD, on=.(group, time<time), x.value, by=.EACHI, mult="last"]$x.value
]

2。将序列分成以TRUE开头的组

DT2[, lv := if(condition[1L]) value[1L], .(group, cumsum(condition))][,
    lv := shift(lv), group]

3。滚动连接data.table(应该是最快的)

DT3[, c("lv", "t2") := .(NA_integer_, shift(time))]
DT3[group!=shift(group), t2 := NA_integer_]
DT3[, lv := DT3[(condition)][.SD, on=.(group, time=t2), roll=Inf, x.value]]

4。。在开发版本的data.table中,您应该可以使用DT[, lv2 := shift(nafill(replace(value, condition=="No", NA_integer_)), group]。注意:无法升级,因此无法测试。

数据:

library(data.table)
DT <- data.table(
    group = c(rep("alpha", 4), rep("beta", 3)),
    time = c(1:4, 1:3),
    condition = c(T, F, T, F, F, T, F),
    value = 1:7,
    last_value = c(NA, 1, 1, 3, NA, NA, 6))
DT2 <- copy(DT); DT3 <- copy(DT);

输出:

   group time condition value last_value lv
1: alpha    1      TRUE     1         NA NA
2: alpha    2     FALSE     2          1  1
3: alpha    3      TRUE     3          1  1
4: alpha    4     FALSE     4          3  3
5:  beta    1     FALSE     5         NA NA
6:  beta    2      TRUE     6         NA NA
7:  beta    3     FALSE     7          6  6

计时代码:

library(data.table)

set.seed(0L)
ng <- 1e6
nr <- 1e7
DT <- data.table(group=sort(sample(ng, nr, TRUE)))
DT[, c("time", "condition", "value") := .(rowid(group), sample(c(TRUE, FALSE), nr, TRUE), .I)]
DT2 <- copy(DT)
DT3 <- copy(DT)

mtd0 <- function() {
    DT[, lv :=
        DT[(condition)][.SD, on=.(group, time<time), x.value, by=.EACHI, mult="last"]$x.value
    ]
}

mtd1 <- function() {
    DT2[, lv := if(condition[1L]) value[1L], .(group, cumsum(condition))][,
        lv := shift(lv), group]
}

mtd2 <- function() {
    DT3[, c("lv", "t2") := .(NA_integer_, shift(time))]
    DT3[group!=shift(group), t2 := NA_integer_]
    DT3[, lv := DT3[(condition)][.SD, on=.(group, time=t2), roll=Inf, x.value]]
}

bench::mark(mtd0(), mtd1(), mtd2(), check=FALSE)

时间:

# A tibble: 3 x 13
  expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result     memory   time  gc     
  <bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>     <list>   <lis> <list> 
1 mtd0()       5.24s   5.24s    0.191      485MB    0.191     1     1      5.24s <df[,5] [~ <df[,3]~ <bch~ <tibbl~
2 mtd1()      48.55s  48.55s    0.0206     240MB    9.19      1   446     48.55s <df[,5] [~ <df[,3]~ <bch~ <tibbl~
3 mtd2()       1.28s   1.28s    0.780      618MB    0         1     0      1.28s <df[,6] [~ <df[,3]~ <bch~ <tibbl~