这是对此前一个问题(this thread)的跟进。与那里一样,我需要为group列定义游程(run-length)类型的ID(忽略NA),但这次多了一列seq_break,用来指示序列应在seq_break = TRUE的那一行结束。然而,该帖子中给出的解决方案会在seq_break = TRUE的行开始一个新序列,而实际上这一行应作为前一个序列的最后一个事件包含进去。示例数据附在下面。这种差异可以在第46行观察到:先前的解决方案会从这一行开始序列13,而我需要把它包含在序列12中。
# Sample data: 50 rows, three columns, stored with tibble classes.
#   group           - "home", "away" or NA; run ids are wanted for non-NA runs.
#   seq_break       - logical flag; TRUE marks a row where a sequence ends.
#   expected_output - the desired run id per row (NA where group is NA).
# Row 46 is the case of interest: seq_break is TRUE on a "home" row and the
# expected id is still 12, i.e. the break row belongs to the run it closes.
df <- structure(list(group = c(NA, NA, "home", "home", "home", "home",
"home", "home", "away", NA, NA, "home", "home", "home", NA, NA,
NA, "home", "away", "away", NA, "away", "away", "away", "home",
"away", "away", "away", NA, "home", "home", NA, NA, "away", NA,
NA, "home", NA, NA, "home", "home", "home", "home", "home", "home",
"home", "away", "away", NA, NA), seq_break = c(FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE,
FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, TRUE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE,
FALSE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE,
TRUE), expected_output = c(NA, NA, 1, 1, 1, 1, 1, 1, 2, NA, NA,
3, 3, 3, NA, NA, NA, 4, 5, 5, NA, 6, 6, 6, 7, 8, 8, 8, NA, 9,
9, NA, NA, 10, NA, NA, 11, NA, NA, 12, 12, 12, 12, 12, 12, 12,
13, 13, NA, NA)), .Names = c("group", "seq_break", "expected_output"
), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-50L))
有没有办法用tidyverse来完成?我不知道在这里该如何替换cumsum……
答案 0 :(得分:1)
使用data.table中的rleid和shift:
library(data.table)
setDT(df)

# One chained pass over `df` (modified by reference):
#   1. run id over (group, lagged running count of breaks) -- lagging the
#      count by one row keeps a seq_break = TRUE row in the run it closes;
#   2. rows without a group get no id;
#   3. the remaining run ids are replaced by their group counter (.GRP).
df[, v := rleid(group, shift(cumsum(seq_break)))
][is.na(group), v := NA
][!is.na(group), v := .GRP, by = v]

# Compare with the hand-written expected ids.
stopifnot(df[, all.equal(v, expected_output)])
在这个示例中,seq_break列实际上并不影响结果,因此我不确定自己是否正确地使用了它:
# Same recipe without seq_break: run ids over `group` alone.
df[, v2 := rleid(group)]
# Rows with a missing group get no id.
df[is.na(group), v2 := NA]
# Replace the surviving run ids by their group counter (.GRP).
df[!is.na(group), v2 := .GRP, by = v2]

# Compare with the hand-written expected ids.
stopifnot(df[, all.equal(v2, expected_output)])
由于提问者想要一个tidyverse风格的答案,这里给出一个等价的改写(仍然使用rleid):
library(dplyr)

# dplyr version of the run-id computation; `rleid` still comes from data.table.
res <- df %>%
  mutate(
    # Run id over `group`, blanked where the group itself is missing.
    v2 = replace(data.table::rleid(group), is.na(group), NA),
    # Renumber the remaining ids by order of first appearance.
    v2 = match(v2, unique(v2[!is.na(v2)]))
  )

# Compare with the hand-written expected ids.
stopifnot(with(res, all.equal(v2, expected_output)))
答案 1 :(得分:0)
我们可以创建一个名为seq_break2的新列,并把它加入到管道中,如下所示。这样得到的结果与预期输出相同。
library(tidyverse)
library(data.table)

# Drop the answer column and tag each row with its position, so the computed
# ids can be joined back onto the full table afterwards.
df2 <- df %>%
  select(-expected_output) %>%
  rowid_to_column()

# Compute ids on a reduced copy of df2, then left-join them back by rowid so
# rows removed along the way reappear with NA in expected_output.
df3 <- df2 %>%
  left_join(
    df2 %>%
      mutate(
        # A break only counts on rows whose group is missing.
        seq_break2 = ifelse(seq_break & !is.na(group), FALSE, seq_break),
        ID = rleid(group, seq_break2)
      ) %>%
      group_by(group, seq_break2, ID) %>%
      # Within a run of NA-group break rows, keep just the first row.
      filter(!is.na(group) | !seq_break2 | row_number() <= 1) %>%
      ungroup() %>%
      # Running count of the breaks that survived the filter.
      mutate(ID2 = cumsum(seq_break2)) %>%
      drop_na(group) %>%
      mutate(expected_output = rleid(group, ID2)) %>%
      select(rowid, expected_output),
    by = "rowid"
  ) %>%
  select(-rowid)