我有一份按用户记录的观察列表;每个用户每天可能对 foo 有多次观察。对于每个不同的日期,我想得到各个 foo 值的累积计数。这是我目前写出的代码:
library(tidyverse)
library(lubridate)
# Sample data: one row per observation of `foo` (and `bar`) by a user.
# `created_at` is entered as an ISO date string and then parsed into a
# date-time with lubridate's as_datetime().
df <- tribble(
  ~user_id, ~foo, ~bar, ~created_at,
  1, "a", "b", "2018-07-30",
  1, "a", "c", "2018-07-31",
  1, "a", "c", "2018-07-31",
  1, "b", "a", "2018-08-01",
  1, "b", "c", "2018-08-02",
  1, "b", "a", "2018-08-03",
  1, "a", "a", "2018-08-03",
  2, "b", "b", "2018-07-30",
  2, "b", "c", "2018-07-31",
  2, "a", "a", "2018-08-01",
  2, "a", "a", "2018-08-01",
  2, "a", "c", "2018-08-02",
  2, "a", "c", "2018-08-02",
  2, "a", "a", "2018-08-03"
) %>%
  mutate(created_at = as_datetime(created_at))
# Running count of observations of each `foo` value, per user and per day.
df %>%
  # One row per (user_id, foo, day) with the number of observations `n`.
  count(user_id, foo, cutoff_date = date(created_at)) %>%
  # count() returns an ungrouped result, so group again: the running total
  # must restart for every (user_id, foo) pair.
  group_by(user_id, foo) %>%
  mutate(foo_cnt = cumsum(n)) %>%
  select(-n) %>%
  arrange(user_id, cutoff_date, foo)
得到的输出是:
user_id foo cutoff_date foo_cnt
<dbl> <chr> <date> <int>
1 1. a 2018-07-30 1
2 1. a 2018-07-31 3
3 1. b 2018-08-01 1
4 1. b 2018-08-02 2
5 1. a 2018-08-03 4
6 1. b 2018-08-03 3
7 2. b 2018-07-30 1
8 2. b 2018-07-31 2
9 2. a 2018-08-01 2
10 2. a 2018-08-02 4
11 2. a 2018-08-03 5
很好,由此我知道,截至 8 月 3 日,用户 1 见过 a 四次、b 三次。现在我想知道,对于数据中出现的每个日期(数据中没有出现的日期我不关心):截至该日期 foo 观察的累计总数(foo_cnt_total),以及每个 foo 值所占的百分比(foo_pct)。也就是说,输出应为:
user_id cutoff_date foo foo_cnt foo_cnt_total foo_pct
1 1. 2018-07-30 a 1 1 100
2 1. 2018-07-30 b 0 0 0
3 1. 2018-07-31 a 3 4 100
4 1. 2018-07-31 b 0 0 0
5 1. 2018-08-01 a 3 7 87.5
6 1. 2018-08-01 b 1 1 12.5
...
在第 5 行中,该百分比为 87.5%,因为到那时为止,用户已经见过 a 七次、b 一次(7 / 8 = 87.5%)。
我大致知道该怎么做,但卡在了这一步:对于数据中存在、但某个 foo 值当天没有被观察到的日期,如何补上该 foo 值对应的行。我研究过 complete(),但没弄明白如何用它来填充这些缺失的值。
例如,当我添加其中任何一个时,都不会获得其他列:
# Attempt 1 (appended to the pipeline above): ask for one row per cutoff_date
# for every (user_id, foo) pair that occurs in the data.
complete(nesting(user_id, foo), cutoff_date)
# Attempt 2: ask for one row for every combination of user_id, cutoff_date
# and foo.
complete(user_id, cutoff_date, foo)
我漏掉了什么?
更新:我已按建议添加了 ungroup(),现在也能得到每天的总数了。我还使用 fill(),用同一 foo 值的上一个计数来填补空缺:
# Per-day running counts of each `foo` value, expanded so that every
# (user_id, foo) pair has a row for every date, plus the per-day total.
df %>%
  count(user_id, foo, cutoff_date = date(created_at)) %>%
  # Running total restarts for every (user_id, foo) pair.
  group_by(user_id, foo) %>%
  mutate(foo_cnt = cumsum(n)) %>%
  # complete() has to see ungrouped data to add the missing rows.
  ungroup() %>%
  select(-n) %>%
  complete(nesting(user_id, foo), cutoff_date) %>%
  arrange(user_id, cutoff_date, foo) %>%
  # Carry the last known count forward within each (user_id, foo) pair.
  # fill() only fills downwards by default, so rows before the first
  # observation of a pair keep NA.
  group_by(user_id, foo) %>%
  fill(foo_cnt) %>%
  # group_by() replaces the previous grouping; no ungroup() needed in between.
  group_by(user_id, cutoff_date) %>%
  mutate(foo_cnt_total = sum(foo_cnt, na.rm = TRUE))
user_id foo cutoff_date foo_cnt foo_cnt_total
<dbl> <chr> <date> <int> <int>
1 1. a 2018-07-30 1 1
2 1. a 2018-07-31 3 3
3 1. a 2018-08-01 3 4
4 1. a 2018-08-02 3 5
5 1. a 2018-08-03 4 7
6 1. b 2018-07-30 NA 1
7 1. b 2018-07-31 NA 3
8 1. b 2018-08-01 1 4
9 1. b 2018-08-02 2 5
10 1. b 2018-08-03 3 7
但是,b 的值不应该以 NA 开头。这里还需要做什么?
答案 0 :(得分:1)
您可以在 complete() 调用中通过 fill 参数指定填充值,并稍微调整各个步骤的顺序,以获得所需的输出:
# Daily counts are completed with explicit zeros *before* the running total
# is taken, so a missing day simply adds 0 instead of producing NA.
df %>%
  # One row per (user_id, foo, day) with the number of observations `n`.
  count(user_id, foo, cutoff_date = date(created_at)) %>%
  # Add a row with n = 0 for every date on which a (user_id, foo) pair
  # was not observed.
  complete(nesting(user_id, foo), cutoff_date, fill = list(n = 0)) %>%
  # cumsum() depends on row order: sort by date within each pair first.
  arrange(user_id, foo, cutoff_date) %>%
  group_by(user_id, foo) %>%
  mutate(foo_cnt = cumsum(n)) %>%
  select(-n) %>%
  # Per user and day: total over all foo values, and each value's share.
  group_by(user_id, cutoff_date) %>%
  mutate(foo_cnt_total = sum(foo_cnt)) %>%
  mutate(foo_pct = 100 * foo_cnt / foo_cnt_total)
# A tibble: 20 x 6
# Groups: user_id, cutoff_date [10]
# user_id foo cutoff_date foo_cnt foo_cnt_total foo_pct
# <dbl> <chr> <date> <dbl> <dbl> <dbl>
# 1 1 a 2018-07-30 1 1 100
# 2 1 a 2018-07-31 3 3 100
# 3 1 a 2018-08-01 3 4 75
# 4 1 a 2018-08-02 3 5 60
# 5 1 a 2018-08-03 4 7 57.1
# 6 1 b 2018-07-30 0 1 0
# 7 1 b 2018-07-31 0 3 0
# 8 1 b 2018-08-01 1 4 25
# 9 1 b 2018-08-02 2 5 40
# 10 1 b 2018-08-03 3 7 42.9
答案 1 :(得分:1)
# Sample data: one row per observation of `foo` (and `bar`) by a user.
# `created_at` is entered as an ISO date string and then parsed into a
# date-time with lubridate's as_datetime().
df <- tribble(
  ~user_id, ~foo, ~bar, ~created_at,
  1, "a", "b", "2018-07-30",
  1, "a", "c", "2018-07-31",
  1, "a", "c", "2018-07-31",
  1, "b", "a", "2018-08-01",
  1, "b", "c", "2018-08-02",
  1, "b", "a", "2018-08-03",
  1, "a", "a", "2018-08-03",
  2, "b", "b", "2018-07-30",
  2, "b", "c", "2018-07-31",
  2, "a", "a", "2018-08-01",
  2, "a", "a", "2018-08-01",
  2, "a", "c", "2018-08-02",
  2, "a", "c", "2018-08-02",
  2, "a", "a", "2018-08-03"
) %>%
  mutate(created_at = as_datetime(created_at))
# Per user, day and `foo` value: the running count (`foo_cnt`, 0 on days
# without an observation), its cumulative sum (`foo_cnt_total`), and the share
# of that cumulative sum among all foo values of the same user and day
# (`foo_pct`).
df %>%
  dplyr::mutate(cutoff_date = date(created_at)) %>%
  group_by(user_id, foo, cutoff_date) %>%
  tally() %>%
  # tally() drops the last grouping level, so this running count restarts
  # for every (user_id, foo) pair.
  dplyr::mutate(foo_cnt = cumsum(n)) %>%
  select(-n) %>%
  # complete() must run on ungrouped data: completing on a grouping column
  # (user_id) is rejected by current tidyr versions.
  ungroup() %>%
  complete(nesting(user_id, foo), cutoff_date, fill = list(foo_cnt = 0)) %>%
  # cumsum() below depends on row order, so sort after completing.
  arrange(user_id, cutoff_date, foo) %>%
  group_by(user_id, foo) %>%
  dplyr::mutate(foo_cnt_total = cumsum(foo_cnt)) %>%
  # Share of each foo value within the same user and day.
  group_by(user_id, cutoff_date) %>%
  dplyr::mutate(foo_pct = foo_cnt_total / sum(foo_cnt_total) * 100) %>%
  ungroup()
结果:
# A tibble: 20 x 6
user_id foo cutoff_date foo_cnt foo_cnt_total foo_pct
<dbl> <chr> <date> <dbl> <dbl> <dbl>
1 1 a 2018-07-30 1 1 100
2 1 b 2018-07-30 0 0 0
3 1 a 2018-07-31 3 4 100
4 1 b 2018-07-31 0 0 0
5 1 a 2018-08-01 0 4 80
6 1 b 2018-08-01 1 1 20
7 1 a 2018-08-02 0 4 57.1
8 1 b 2018-08-02 2 3 42.9
9 1 a 2018-08-03 4 8 57.1
10 1 b 2018-08-03 3 6 42.9
11 2 a 2018-07-30 0 0 0
12 2 b 2018-07-30 1 1 100
13 2 a 2018-07-31 0 0 0
14 2 b 2018-07-31 2 3 100
15 2 a 2018-08-01 2 2 40
16 2 b 2018-08-01 0 3 60
17 2 a 2018-08-02 4 6 66.7
18 2 b 2018-08-02 0 3 33.3
19 2 a 2018-08-03 5 11 78.6
20 2 b 2018-08-03 0 3 21.4