# Example data in dput() form: a 26-row tibble with integer columns `group`
# and `var`, plus a numeric column `first` holding the desired result.
structure(list(group = c(17L, 17L, 17L, 18L, 18L, 18L, 18L, 19L,
19L, 19L, 20L, 20L, 20L, 21L, 21L, 22L, 23L, 24L, 25L, 25L, 25L,
26L, 27L, 27L, 27L, 28L), var = c(74L, 49L, 1L, 74L, 1L, 49L,
61L, 49L, 1L, 5L, 5L, 1L, 44L, 44L, 12L, 13L, 5L, 5L, 1L, 1L,
4L, 4L, 1L, 1L, 1L, 49L), first = c(0, 0, 1, 0, 1, 0, 0, 0, 1,
0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0)), .Names = c("group",
"var", "first"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-26L))
使用前两列中的数据,我想创建第三列(称为 first),其中仅当 var == 1 在组内首次出现时 first == 1。换句话说,我想在每个 group 中标记第一个满足 var == 1 的元素。如何用 dplyr 做到这一点?当然应该使用 group_by,但下一步呢?
答案 0 :(得分:1)
library(dplyr)

# Drop the supplied `first` column so it is rebuilt from `group` and `var`.
df$first <- NULL

# Within each group, a row is flagged when its position equals the smallest
# position at which var == 1. In a group with no var == 1, min() receives an
# empty vector, warns, and returns Inf, so no row in that group is flagged.
df %>%
  group_by(group) %>%
  mutate(
    first = as.numeric(min(row_number()[var == 1]) == row_number())
  ) %>%
  ungroup()
# # A tibble: 26 x 3
# group var first
# <int> <int> <dbl>
# 1 17 74 0
# 2 17 49 0
# 3 17 1 1
# 4 18 74 0
# 5 18 1 1
# 6 18 49 0
# 7 18 61 0
# 8 19 49 0
# 9 19 1 1
# 10 19 5 0
# # ... with 16 more rows
思路是在每个组中标记 var == 1 的最小行号。
这会产生一些警告,因为某些组中不存在 var == 1 的情况(此时 min() 作用于空向量,返回 Inf 并发出警告,但这些组的结果仍然全为 0)。
另一个选择是:
library(dplyr)

# Drop the supplied `first` column so it is rebuilt from `group` and `var`.
df$first <- NULL

# create row id (uniquely identifies each original row for the join below)
df$id <- seq_along(df$group)

# One row per group: the first row of that group where var == 1.
first_hits <- df %>%
  filter(var == 1) %>%                     # keep cases where var = 1
  distinct(group, .keep_all = TRUE) %>%    # keep first case per group
  mutate(first = 1)                        # create first column

# Start from the full data and left_join the flags onto it, so the result
# keeps the rows of `df` in their original order. With
# right_join(first_hits, df) the row order of the result depends on the dplyr
# version and may put the matched rows first.
df %>%
  left_join(first_hits, by = c("id", "group", "var")) %>%
  mutate(first = coalesce(first, 0)) %>%   # replace NAs with 0
  select(-id)                              # remove row id
# # A tibble: 26 x 3
# group var first
# <int> <int> <dbl>
# 1 17 74 0
# 2 17 49 0
# 3 17 1 1
# 4 18 74 0
# 5 18 1 1
# 6 18 49 0
# 7 18 61 0
# 8 19 49 0
# 9 19 1 1
#10 19 5 0
# # ... with 16 more rows
答案 1 :(得分:1)
对于未分组的数据,一种解决方案是
# Flag the first element of `x` that equals `value`.
#
# x:     vector to search.
# value: single value to look for.
# Returns a logical vector the same length as `x` that is TRUE only at the
# first position where `x == value`, and all FALSE when there is none.
first_equal_to <- function(x, value) {
  hit <- x == value
  # A missing comparison counts as "no match"; otherwise a single NA would
  # make cumsum() return NA for every later position.
  hit[is.na(hit)] <- FALSE
  hit & cumsum(hit) == 1
}
于是,对于分组数据可以这样使用:
# Apply first_equal_to() within each group, then drop the grouping so that
# later steps operate on the whole table rather than per group.
tbl %>%
  group_by(group) %>%
  mutate(first = first_equal_to(var, 1)) %>%
  ungroup()
(将其保留为逻辑向量似乎很合适,因为这就是该列所代表的意思)。
另一个实现是
# Alternative implementation: find the first occurrence of `value` in `x`
# with match() and switch on only that position.
#
# Returns a logical vector as long as `x`. When match() finds nothing it
# yields NA, which selects no element, so the result stays all FALSE.
first_equal_to2 <- function(x, value) {
  first_pos <- match(value, x)
  replace(logical(length(x)), first_pos, TRUE)
}
答案 2 :(得分:0)
我们可以用如下所示的表达式来计算 first:
# Per group, multiply the logical vector (var == 1) by "not a repeat of an
# earlier value of that same vector". Only the first TRUE yields 1; FALSE
# rows and later TRUE rows yield 0.
DF %>%
  group_by(group) %>%
  mutate(first = (var == 1) * (!duplicated(var == 1))) %>%
  ungroup()
结果为:
# A tibble: 26 x 3
group var first
<int> <int> <int>
1 17 74 0
2 17 49 0
3 17 1 1
4 18 74 0
5 18 1 1
6 18 49 0
7 18 61 0
8 19 49 0
9 19 1 1
10 19 5 0
# ... with 16 more rows