我有一个如下所示的数据集:
> head(df)
# A tibble: 6 × 3
id tstart tstop
<dbl> <dttm> <dttm>
1 115 2016-01-04 19:14:06 2016-01-04 19:14:15
2 115 2016-01-04 19:14:15 2016-01-04 19:14:16
3 115 2016-01-04 19:14:16 2016-01-04 20:00:00
4 115 2016-01-04 20:00:00 2016-01-04 23:32:06
5 119 2016-01-09 12:56:49 2016-01-09 13:09:38
6 119 2016-01-09 19:21:30 2016-01-09 19:26:48
> dput(df)
structure(list(id = c(115, 115, 115, 115, 119, 119, 119, 119,
115, 119, 115, 115, 119, 119, 115, 115, 115, 115, 119, 115, 115,
119, 119, 115, 115, 119, 119, 119, 119, 119, 119, 119, 119, 119,
119, 115, 119, 119, 115, 119, 119, 115, 119, 115, 115, 115, 115,
115), tstart = structure(c(1451960046, 1451960055, 1451960056,
1451962800, 1452369409, 1452392490, 1452656773, 1452768075, 1453117929,
1453158614, 1453211410, 1453241664, 1453472208, 1453501656, 1453683210,
1453859618, 1453923350, 1454160212, 1454185221, 1454334295, 1454667974,
1454893810, 1455228853, 1455498598, 1455551174, 1455586503, 1455652857,
1455747333, 1455965433, 1456053421, 1456137889, 1456482398, 1456590733,
1456839351, 1456945452, 1457003430, 1457099049, 1457108703, 1457445523,
1457478749, 1457480525, 1457542159, 1457562948, 1458598425, 1458822311,
1458940977, 1459028316, 1459083563), class = c("POSIXct", "POSIXt"
), tzone = ""), tstop = structure(c(1451960055, 1451960056, 1451962800,
1451975526, 1452370178, 1452392808, 1452656986, 1452768517, 1453118186,
1453158918, 1453211770, 1453242132, 1453472619, 1453502485, 1453683500,
1453859899, 1453923567, 1454161008, 1454185580, 1454334848, 1454668930,
1454894182, 1455229448, 1455499217, 1455552432, 1455587211, 1455653538,
1455747987, 1455965658, 1456053774, 1456138469, 1456482801, 1456591336,
1456839506, 1456945790, 1457003644, 1457099216, 1457109800, 1457445783,
1457480525, 1457480533, 1457542907, 1457563544, 1458598877, 1458822887,
1458941209, 1459028558, 1459083990), class = c("POSIXct", "POSIXt"
))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-48L), .Names = c("id", "tstart", "tstop"))
> head(df)
# A tibble: 6 × 3
id tstart tstop
<dbl> <dttm> <dttm>
1 115 2016-01-04 19:14:06 2016-01-04 19:14:15
2 115 2016-01-04 19:14:15 2016-01-04 19:14:16
3 115 2016-01-04 19:14:16 2016-01-04 20:00:00
4 115 2016-01-04 20:00:00 2016-01-04 23:32:06
5 115 2016-01-18 04:52:09 2016-01-18 04:56:26
6 115 2016-01-19 06:50:10 2016-01-19 06:56:10
我尝试创建一个事件序列 event.seq:如果某一行是上一行的延续(即它的 tstart 等于上一行的 tstop),则它与上一行属于同一事件。每当 id 发生变化时,序列都会重置。我试图获得的最终数据框是:
> dput(df.out)
structure(list(id = c(115, 115, 115, 115, 115, 115, 115, 115,
115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115,
115, 115, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119,
119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119,
119), tstart = structure(c(1451960046, 1451960055, 1451960056,
1451962800, 1453117929, 1453211410, 1453241664, 1453683210, 1453859618,
1453923350, 1454160212, 1454334295, 1454667974, 1455498598, 1455551174,
1457003430, 1457445523, 1457542159, 1458598425, 1458822311, 1458940977,
1459028316, 1459083563, 1452369409, 1452392490, 1452656773, 1452768075,
1453158614, 1453472208, 1453501656, 1454185221, 1454893810, 1455228853,
1455586503, 1455652857, 1455747333, 1455965433, 1456053421, 1456137889,
1456482398, 1456590733, 1456839351, 1456945452, 1457099049, 1457108703,
1457478749, 1457480525, 1457562948), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), tstop = structure(c(1451960055, 1451960056,
1451962800, 1451975526, 1453118186, 1453211770, 1453242132, 1453683500,
1453859899, 1453923567, 1454161008, 1454334848, 1454668930, 1455499217,
1455552432, 1457003644, 1457445783, 1457542907, 1458598877, 1458822887,
1458941209, 1459028558, 1459083990, 1452370178, 1452392808, 1452656986,
1452768517, 1453158918, 1453472619, 1453502485, 1454185580, 1454894182,
1455229448, 1455587211, 1455653538, 1455747987, 1455965658, 1456053774,
1456138469, 1456482801, 1456591336, 1456839506, 1456945790, 1457099216,
1457109800, 1457480525, 1457480533, 1457563544), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), event.seq = c(1, 1, 1, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 23, 24)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -48L), .Names = c("id", "tstart", "tstop",
"event.seq"))
> head(df.out)
# A tibble: 6 × 4
id tstart tstop event.seq
<dbl> <dttm> <dttm> <dbl>
1 115 2016-01-05 02:14:06 2016-01-05 02:14:15 1
2 115 2016-01-05 02:14:15 2016-01-05 02:14:15 1
3 115 2016-01-05 02:14:15 2016-01-05 03:00:00 1
4 115 2016-01-05 03:00:00 2016-01-05 06:32:06 1
5 115 2016-01-18 11:52:09 2016-01-18 11:56:26 2
6 115 2016-01-19 13:50:10 2016-01-19 13:56:09 3
下面的代码让我接近了目标,但结果还不是我想要的:
# Attempt from the question: after sorting by id and start time, mark a row
# with 1 when it touches a neighbouring row (its tstop equals the next row's
# tstart, or its tstart equals the previous row's tstop) and with 0 otherwise.
# This yields a 0/1 flag rather than a running event counter.
df.2 <- df %>%
  arrange(id, tstart) %>%
  mutate(
    next_start = lead(tstart),
    prev_stop = lag(tstop),
    joins_next = as.numeric(tstop == next_start),
    joins_prev = as.numeric(tstart == prev_stop),
    # na.rm = TRUE lets the first/last row fall back on its only neighbour.
    event.seq = pmax(joins_next, joins_prev, na.rm = TRUE)
  ) %>%
  select(id, tstart, tstop, event.seq)
答案 0 :(得分:1)
这有点棘手。由于您要为每个 id 重置序列,我们肯定需要 group_by(id)。然后我们创建一个列,指示每一行是否不是前一行的延续。最后,对这个指标使用 cumsum:如果不是延续,则加 1,event.seq 增加;如果是延续,则加 0,event.seq 保持不变。最后再加 1,使序列从 1 而不是 0 开始。
library(dplyr)
# Number the events within each id. A row continues the previous event when
# its tstart equals the previous row's tstop; otherwise it opens a new event.
df.2 <- df %>%
  arrange(id, tstart) %>%
  group_by(id) %>%
  mutate(
    # The first row of an id has no predecessor (lag() gives NA there), so it
    # is never counted as a break. Guarding with row_number() avoids patching
    # the vector by hand with c(0, ...[-1]).
    not_continued = row_number() > 1 & lag(tstop) != tstart,
    # Each break adds 1; the leading 1 makes the numbering start at 1.
    event.seq = 1 + cumsum(not_continued)
  ) %>%
  # Drop the grouping so df.2 is a plain tibble, like the expected df.out.
  ungroup() %>%
  select(-not_continued)
# Compare the computed result with the expected output df.out.
all.equal(df.2, df.out)
# [1] TRUE