我正在尝试构造一个特定于组的变量,
以下是MWE。我在寻找构造“ last_value”的代码。我更喜欢使用data.table(或dplyr)来实现可伸缩性/性能,但愿意接受其他建议。谢谢!
library(data.table)
data.table(
group = c(rep("alpha", 4), rep("beta", 3)),
time = c(1:4, 1:3),
condition = c("Yes", "No", "Yes", "No", "No", "Yes", "No"),
value = 1:7,
last_value = c(NA, 1, 1, 3, NA, NA, 6))
# group time condition value last_value
# 1: alpha 1 Yes 1 NA
# 2: alpha 2 No 2 1
# 3: alpha 3 Yes 3 1
# 4: alpha 4 No 4 3
# 5: beta 1 No 5 NA
# 6: beta 2 Yes 6 NA
# 7: beta 3 No 7 6
# last_value is:
# NA in the 1st row as that is the 1st observation for group "alpha"
# 1 in the 2nd row as the 1st observation is condition = "Yes"
# 1 in the 3rd row as the 1st observation is condition = "Yes"; 2nd observation is condition = "No"
# 3 in the 4rd row as the 3rd observation is condition = "Yes"
# NA in the 5th row as that is the 1st observation for group "beta"
# NA in the 6th row as there is no prior observation with condition = "Yes"
# 6 in the 7th row as the 6th observation is condition = "Yes"
答案 0 :(得分:2)
这是dplyr
中使用cummax
的另一个。
library(dplyr)
df %>%
group_by(group) %>%
mutate(last = cummax(row_number() * (condition == "Yes")),
last = lag(value[replace(last, last == 0, NA)]))
# group time condition value last_value last
# <fct> <int> <fct> <int> <dbl> <int>
#1 alpha 1 Yes 1 NA NA
#2 alpha 2 No 2 1 1
#3 alpha 3 Yes 3 1 1
#4 alpha 4 No 4 3 3
#5 beta 1 No 5 NA NA
#6 beta 2 Yes 6 NA NA
#7 beta 3 No 7 6 6
答案 1 :(得分:2)
借此机会来体验data.table
和fifelse()
的一些最新添加:
nafill()
或使用管道:
DT[order(time), # necessary if data is note in order
last_value2 := nafill(shift(fifelse(condition == "Yes", value, NA_integer_)), "locf"),
by = group
]
group time condition value last_value last_value2
1: alpha 1 Yes 1 NA NA
2: alpha 2 No 2 1 1
3: alpha 3 Yes 3 1 1
4: alpha 4 No 4 3 3
5: beta 1 No 5 NA NA
6: beta 2 Yes 6 NA NA
7: beta 3 No 7 6 6
答案 2 :(得分:1)
这是一种dplyr
方法,其中last_value
中的期望输出是在last_value2
中如实生成的。
library(dplyr)
library(tidyr)
df %>%
group_by(group) %>%
mutate(value2 = if_else(condition == "Yes", value, NA_integer_)) %>%
tidyr::fill(value2) %>%
mutate(last_value2 = lag(value2)) %>%
ungroup()
## A tibble: 7 x 7
# group time condition value last_value value2 last_value2
# <fct> <int> <fct> <int> <dbl> <int> <int>
#1 alpha 1 Yes 1 NA 1 NA
#2 alpha 2 No 2 1 1 1
#3 alpha 3 Yes 3 1 3 1
#4 alpha 4 No 4 3 3 3
#5 beta 1 No 5 NA NA NA
#6 beta 2 Yes 6 NA 6 NA
#7 beta 3 No 7 6 6 6
假设此处加载了数据:
df <- data.frame(
group = c(rep("alpha", 4), rep("beta", 3)),
time = c(1:4, 1:3),
condition = c("Yes", "No", "Yes", "No", "No", "Yes", "No"),
value = 1:7,
last_value = c(NA, 1, 1, 3, NA, NA, 6))
答案 3 :(得分:1)
以下是使用data.table
的一些选项:
1。使用非等额联接
DT[, lv :=
DT[condition=="Yes"][.SD, on=.(group, time<time), x.value, by=.EACHI, mult="last"]$x.value
]
2。将序列分成以TRUE开头的组
DT2[, lv := if(condition[1L]) value[1L], .(group, cumsum(condition))][,
lv := shift(lv), group]
3。滚动连接data.table(应该是最快的)
DT3[, c("lv", "t2") := .(NA_integer_, shift(time))]
DT3[group!=shift(group), t2 := NA_integer_]
DT3[, lv := DT3[(condition)][.SD, on=.(group, time=t2), roll=Inf, x.value]]
4。。在开发版本的data.table
中,您应该可以使用DT[, lv2 := shift(nafill(replace(value, condition=="No", NA_integer_)), group]
。注意:无法升级,因此无法测试。
数据:
library(data.table)
DT <- data.table(
group = c(rep("alpha", 4), rep("beta", 3)),
time = c(1:4, 1:3),
condition = c(T, F, T, F, F, T, F),
value = 1:7,
last_value = c(NA, 1, 1, 3, NA, NA, 6))
DT2 <- copy(DT); DT3 <- copy(DT);
输出:
group time condition value last_value lv
1: alpha 1 TRUE 1 NA NA
2: alpha 2 FALSE 2 1 1
3: alpha 3 TRUE 3 1 1
4: alpha 4 FALSE 4 3 3
5: beta 1 FALSE 5 NA NA
6: beta 2 TRUE 6 NA NA
7: beta 3 FALSE 7 6 6
计时代码:
library(data.table)
set.seed(0L)
ng <- 1e6
nr <- 1e7
DT <- data.table(group=sort(sample(ng, nr, TRUE)))
DT[, c("time", "condition", "value") := .(rowid(group), sample(c(TRUE, FALSE), nr, TRUE), .I)]
DT2 <- copy(DT)
DT3 <- copy(DT)
mtd0 <- function() {
DT[, lv :=
DT[(condition)][.SD, on=.(group, time<time), x.value, by=.EACHI, mult="last"]$x.value
]
}
mtd1 <- function() {
DT2[, lv := if(condition[1L]) value[1L], .(group, cumsum(condition))][,
lv := shift(lv), group]
}
mtd2 <- function() {
DT3[, c("lv", "t2") := .(NA_integer_, shift(time))]
DT3[group!=shift(group), t2 := NA_integer_]
DT3[, lv := DT3[(condition)][.SD, on=.(group, time=t2), roll=Inf, x.value]]
}
bench::mark(mtd0(), mtd1(), mtd2(), check=FALSE)
时间:
# A tibble: 3 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <lis> <list>
1 mtd0() 5.24s 5.24s 0.191 485MB 0.191 1 1 5.24s <df[,5] [~ <df[,3]~ <bch~ <tibbl~
2 mtd1() 48.55s 48.55s 0.0206 240MB 9.19 1 446 48.55s <df[,5] [~ <df[,3]~ <bch~ <tibbl~
3 mtd2() 1.28s 1.28s 0.780 618MB 0 1 0 1.28s <df[,6] [~ <df[,3]~ <bch~ <tibbl~