我正在尝试按变量(ID)对数据进行分组,然后根据日期创建剧集。这篇文章帮助我创建了想要的输出,但是我不知道如何为分组变量(ID)创建情节。 Locator Strategies
上面链接文章中的建议效果很好,但仅适用于一个ID。
runs <-rle(df$EpisodeTimeCriterian)$lengths
df$Episode <- rep(1:length(runs),runs)
我最喜欢使用dplyr对数据进行分组,但是当我尝试group_by然后创建Episode变量时,我收到了错误消息。
df %>%
group_by(ID)%>%
mutate(Episode = rep(1:length(runs),runs))
Error: Column `Episode` must be length 42 (the group size) or one, not 66
由于下面Ben的建议,我能够按个人ID对它们进行分组,但是现在我意识到我在两个日期之间的时间做错了。我希望如果自上次约会以来已过30天以上,则可以开始新一集。我以为我是通过计算两者之间的差异时间来完成此操作的,但是它不起作用。
我想期待一集:
# A tibble: 24 x 5
ID Date days_until_next EpisodeTimeCriterian expected
<chr> <date> <dbl> <lgl> <dbl>
1 456 2013-10-07 7 TRUE 1
2 456 2013-10-14 119 FALSE 1
3 456 2014-02-10 220 FALSE 2
4 456 2014-09-18 4 TRUE 3
5 456 2014-09-22 3 TRUE 3
6 456 2014-09-25 7 TRUE 3
7 456 2014-10-02 6 TRUE 3
8 456 2014-10-08 8 TRUE 3
9 456 2014-10-16 97 FALSE 3
10 456 2015-01-21 15 TRUE 4
11 456 2015-02-05 21 TRUE 4
12 456 2015-02-26 41 FALSE 4
13 456 2015-04-08 57 FALSE 5
14 456 2015-06-04 12 TRUE 6
15 456 2015-06-16 2 TRUE 6
16 456 2015-06-18 49 FALSE 6
17 456 2015-08-06 14 TRUE 7
18 456 2015-08-20 42 FALSE 7
19 456 2015-10-01 12 TRUE 8
20 456 2015-10-13 16 TRUE 8
21 456 2015-10-29 12 TRUE 8
22 456 2015-11-10 65 FALSE 8
23 456 2016-01-14 1 TRUE 9
24 456 2016-01-15 -830 TRUE 9
df <- original %>%
group_by(ID)%>% arrange(ID,Date)%>%
mutate(days_until_next = abs(difftime(Date,lead(Date,1),units="days")))%>%
mutate(EpisodeTimeCriterian= days_until_next <=30 | is.na(days_until_next))
runs <-rle(df$EpisodeTimeCriterian)$lengths
df$Episode <- rep(1:length(runs),runs)
df %>%
group_by(ID) %>%
mutate(
Episode2 = {
r <- rle(EpisodeTimeCriterian)
r$values <- cumsum(rep(1, length(r$values)))
inverse.rle(r)
}
) %>%
print(n=66)
df <- structure(list(ID = c("123", "123", "123", "123", "123", "123",
"123", "123", "123", "123", "123", "123", "123", "123", "123",
"123", "123", "123", "123", "123", "123", "123", "123", "123",
"123", "123", "123", "123", "123", "123", "123", "123", "123",
"123", "123", "123", "123", "123", "123", "123", "123", "123",
"456", "456", "456", "456", "456", "456", "456", "456", "456",
"456", "456", "456", "456", "456", "456", "456", "456", "456",
"456", "456", "456", "456", "456", "456"), Date = structure(c(15986,
15993, 16000, 16007, 16014, 16021, 16028, 16035, 16042, 16056,
16066, 16077, 16084, 16091, 16093, 16094, 16098, 16105, 16106,
16133, 18130, 18137, 18139, 18144, 18151, 18164, 18176, 18190,
18197, 18204, 18211, 18218, 18225, 18232, 18239, 18246, 18253,
18254, 18267, 18274, 18281, 18288, 15985, 15992, 16111, 16331,
16335, 16338, 16345, 16351, 16359, 16456, 16471, 16492, 16533,
16590, 16602, 16604, 16653, 16667, 16709, 16721, 16737, 16749,
16814, 16815), class = "Date"), days_until_next = c(7, 7, 7,
7, 7, 7, 7, 7, 14, 10, 11, 7, 7, 2, 1, 4, 7, 1, 27, 1997, 7,
2, 5, 7, 13, 12, 14, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 13, 7, 7,
7, -2302, 7, 119, 220, 4, 3, 7, 6, 8, 97, 15, 21, 41, 57, 12,
2, 49, 14, 42, 12, 16, 12, 65, 1, -830), EpisodeTimeCriterian = c(TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE,
FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE,
FALSE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE,
TRUE, TRUE)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-66L))
df %>%
structure(list(ID = c("456", "456", "456", "456", "456", "456",
"456", "456", "456", "456", "456", "456", "456", "456", "456",
"456", "456", "456", "456", "456", "456", "456", "456", "456"
), Date = structure(c(15985, 15992, 16111, 16331, 16335, 16338,
16345, 16351, 16359, 16456, 16471, 16492, 16533, 16590, 16602,
16604, 16653, 16667, 16709, 16721, 16737, 16749, 16814, 16815
), class = "Date"), days_until_next = c(7, 119, 220, 4, 3, 7,
6, 8, 97, 15, 21, 41, 57, 12, 2, 49, 14, 42, 12, 16, 12, 65,
1, -830), EpisodeTimeCriterian = c(TRUE, FALSE, FALSE, TRUE,
TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE,
TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE
), expected = c(1, 1, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 6, 6,
6, 7, 7, 8, 8, 8, 8, 9, 9)), row.names = c(NA, -24L), class = c("tbl_df",
"tbl", "data.frame"))
答案 0 :(得分:1)
这可能是一种方法。我在从初始示例创建的group_by
旁边添加了Episode2
作为Episode
的较新输出。希望这会有所帮助。
library(tidyverse)
df %>%
group_by(ID) %>%
mutate(
Episode2 = {
r <- rle(EpisodeTimeCriterian)
r$values <- cumsum(rep(1, length(r$values)))
inverse.rle(r)
}
) %>%
print(n=66)
输出
# A tibble: 66 x 6
# Groups: ID [2]
ID Date days_until_next EpisodeTimeCriterian Episode Episode2
<chr> <date> <dbl> <lgl> <int> <dbl>
1 123 2013-10-08 7 TRUE 1 1
2 123 2013-10-15 7 TRUE 1 1
3 123 2013-10-22 7 TRUE 1 1
4 123 2013-10-29 7 TRUE 1 1
5 123 2013-11-05 7 TRUE 1 1
6 123 2013-11-12 7 TRUE 1 1
7 123 2013-11-19 7 TRUE 1 1
8 123 2013-11-26 7 TRUE 1 1
9 123 2013-12-03 14 TRUE 1 1
10 123 2013-12-17 10 TRUE 1 1
11 123 2013-12-27 11 TRUE 1 1
12 123 2014-01-07 7 TRUE 1 1
13 123 2014-01-14 7 TRUE 1 1
14 123 2014-01-21 2 TRUE 1 1
15 123 2014-01-23 1 TRUE 1 1
16 123 2014-01-24 4 TRUE 1 1
17 123 2014-01-28 7 TRUE 1 1
18 123 2014-02-04 1 TRUE 1 1
19 123 2014-02-05 27 TRUE 1 1
20 123 2014-03-04 1997 FALSE 2 2
21 123 2019-08-22 7 TRUE 3 3
22 123 2019-08-29 2 TRUE 3 3
23 123 2019-08-31 5 TRUE 3 3
24 123 2019-09-05 7 TRUE 3 3
25 123 2019-09-12 13 TRUE 3 3
26 123 2019-09-25 12 TRUE 3 3
27 123 2019-10-07 14 TRUE 3 3
28 123 2019-10-21 7 TRUE 3 3
29 123 2019-10-28 7 TRUE 3 3
30 123 2019-11-04 7 TRUE 3 3
31 123 2019-11-11 7 TRUE 3 3
32 123 2019-11-18 7 TRUE 3 3
33 123 2019-11-25 7 TRUE 3 3
34 123 2019-12-02 7 TRUE 3 3
35 123 2019-12-09 7 TRUE 3 3
36 123 2019-12-16 7 TRUE 3 3
37 123 2019-12-23 1 TRUE 3 3
38 123 2019-12-24 13 TRUE 3 3
39 123 2020-01-06 7 TRUE 3 3
40 123 2020-01-13 7 TRUE 3 3
41 123 2020-01-20 7 TRUE 3 3
42 123 2020-01-27 -2302 TRUE 3 3
43 456 2013-10-07 7 TRUE 3 1
44 456 2013-10-14 119 FALSE 4 2
45 456 2014-02-10 220 FALSE 4 2
46 456 2014-09-18 4 TRUE 5 3
47 456 2014-09-22 3 TRUE 5 3
48 456 2014-09-25 7 TRUE 5 3
49 456 2014-10-02 6 TRUE 5 3
50 456 2014-10-08 8 TRUE 5 3
51 456 2014-10-16 97 FALSE 6 4
52 456 2015-01-21 15 TRUE 7 5
53 456 2015-02-05 21 TRUE 7 5
54 456 2015-02-26 41 FALSE 8 6
55 456 2015-04-08 57 FALSE 8 6
56 456 2015-06-04 12 TRUE 9 7
57 456 2015-06-16 2 TRUE 9 7
58 456 2015-06-18 49 FALSE 10 8
59 456 2015-08-06 14 TRUE 11 9
60 456 2015-08-20 42 FALSE 12 10
61 456 2015-10-01 12 TRUE 13 11
62 456 2015-10-13 16 TRUE 13 11
63 456 2015-10-29 12 TRUE 13 11
64 456 2015-11-10 65 FALSE 14 12
65 456 2016-01-14 1 TRUE 15 13
66 456 2016-01-15 -830 TRUE 15 13
编辑(3/2/20):
我认为,如果规则是日期差> = 30天,则开始新的情节可能比以前的方法更容易。看看这是否适合您:
library(tidyverse)
df %>%
group_by(ID) %>%
mutate(difftime = Date - lag(Date, default = first(Date)),
expected2 = cumsum(difftime >= 30) + 1) %>%
print(n=24)
输出
# A tibble: 24 x 7
# Groups: ID [1]
ID Date days_until_next EpisodeTimeCrit~ expected difftime expected2
<chr> <date> <dbl> <lgl> <dbl> <time> <dbl>
1 456 2013-10-07 7 TRUE 1 0 days 1
2 456 2013-10-14 119 FALSE 1 7 days 1
3 456 2014-02-10 220 FALSE 2 119 days 2
4 456 2014-09-18 4 TRUE 3 220 days 3
5 456 2014-09-22 3 TRUE 3 4 days 3
6 456 2014-09-25 7 TRUE 3 3 days 3
7 456 2014-10-02 6 TRUE 3 7 days 3
8 456 2014-10-08 8 TRUE 3 6 days 3
9 456 2014-10-16 97 FALSE 3 8 days 3
10 456 2015-01-21 15 TRUE 4 97 days 4
11 456 2015-02-05 21 TRUE 4 15 days 4
12 456 2015-02-26 41 FALSE 4 21 days 4
13 456 2015-04-08 57 FALSE 5 41 days 5
14 456 2015-06-04 12 TRUE 6 57 days 6
15 456 2015-06-16 2 TRUE 6 12 days 6
16 456 2015-06-18 49 FALSE 6 2 days 6
17 456 2015-08-06 14 TRUE 7 49 days 7
18 456 2015-08-20 42 FALSE 7 14 days 7
19 456 2015-10-01 12 TRUE 8 42 days 8
20 456 2015-10-13 16 TRUE 8 12 days 8
21 456 2015-10-29 12 TRUE 8 16 days 8
22 456 2015-11-10 65 FALSE 8 12 days 8
23 456 2016-01-14 1 TRUE 9 65 days 9
24 456 2016-01-15 -830 TRUE 9 1 days 9