在每个ID内:识别组并分配计数器

时间:2016-10-27 23:07:24

标签: r time data.table dplyr data-manipulation

我有一个如下所示的数据集:

> head(df)
# A tibble: 6 × 3
     id              tstart               tstop
  <dbl>              <dttm>              <dttm>
1   115 2016-01-04 19:14:06 2016-01-04 19:14:15
2   115 2016-01-04 19:14:15 2016-01-04 19:14:16
3   115 2016-01-04 19:14:16 2016-01-04 20:00:00
4   115 2016-01-04 20:00:00 2016-01-04 23:32:06
5   119 2016-01-09 12:56:49 2016-01-09 13:09:38
6   119 2016-01-09 19:21:30 2016-01-09 19:26:48


> dput(df)
structure(list(id = c(115, 115, 115, 115, 119, 119, 119, 119, 
115, 119, 115, 115, 119, 119, 115, 115, 115, 115, 119, 115, 115, 
119, 119, 115, 115, 119, 119, 119, 119, 119, 119, 119, 119, 119, 
119, 115, 119, 119, 115, 119, 119, 115, 119, 115, 115, 115, 115, 
115), tstart = structure(c(1451960046, 1451960055, 1451960056, 
1451962800, 1452369409, 1452392490, 1452656773, 1452768075, 1453117929, 
1453158614, 1453211410, 1453241664, 1453472208, 1453501656, 1453683210, 
1453859618, 1453923350, 1454160212, 1454185221, 1454334295, 1454667974, 
1454893810, 1455228853, 1455498598, 1455551174, 1455586503, 1455652857, 
1455747333, 1455965433, 1456053421, 1456137889, 1456482398, 1456590733, 
1456839351, 1456945452, 1457003430, 1457099049, 1457108703, 1457445523, 
1457478749, 1457480525, 1457542159, 1457562948, 1458598425, 1458822311, 
1458940977, 1459028316, 1459083563), class = c("POSIXct", "POSIXt"
), tzone = ""), tstop = structure(c(1451960055, 1451960056, 1451962800, 
1451975526, 1452370178, 1452392808, 1452656986, 1452768517, 1453118186, 
1453158918, 1453211770, 1453242132, 1453472619, 1453502485, 1453683500, 
1453859899, 1453923567, 1454161008, 1454185580, 1454334848, 1454668930, 
1454894182, 1455229448, 1455499217, 1455552432, 1455587211, 1455653538, 
1455747987, 1455965658, 1456053774, 1456138469, 1456482801, 1456591336, 
1456839506, 1456945790, 1457003644, 1457099216, 1457109800, 1457445783, 
1457480525, 1457480533, 1457542907, 1457563544, 1458598877, 1458822887, 
1458941209, 1459028558, 1459083990), class = c("POSIXct", "POSIXt"
))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-48L), .Names = c("id", "tstart", "tstop"))

> head(df)
# A tibble: 6 × 3
     id              tstart               tstop
  <dbl>              <dttm>              <dttm>
1   115 2016-01-04 19:14:06 2016-01-04 19:14:15
2   115 2016-01-04 19:14:15 2016-01-04 19:14:16
3   115 2016-01-04 19:14:16 2016-01-04 20:00:00
4   115 2016-01-04 20:00:00 2016-01-04 23:32:06
5   115 2016-01-18 04:52:09 2016-01-18 04:56:26
6   115 2016-01-19 06:50:10 2016-01-19 06:56:10

我尝试创建事件序列event.seq:如果某一行是上一行的延续(即该行的tstart等于上一行的tstop),则它与上一行属于同一事件,否则开始一个新事件。每当id发生变化时,序列都会重置。我试图获得的最终数据框是:

> dput(df.out)
structure(list(id = c(115, 115, 115, 115, 115, 115, 115, 115, 
115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 
115, 115, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 
119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 
119), tstart = structure(c(1451960046, 1451960055, 1451960056, 
1451962800, 1453117929, 1453211410, 1453241664, 1453683210, 1453859618, 
1453923350, 1454160212, 1454334295, 1454667974, 1455498598, 1455551174, 
1457003430, 1457445523, 1457542159, 1458598425, 1458822311, 1458940977, 
1459028316, 1459083563, 1452369409, 1452392490, 1452656773, 1452768075, 
1453158614, 1453472208, 1453501656, 1454185221, 1454893810, 1455228853, 
1455586503, 1455652857, 1455747333, 1455965433, 1456053421, 1456137889, 
1456482398, 1456590733, 1456839351, 1456945452, 1457099049, 1457108703, 
1457478749, 1457480525, 1457562948), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), tstop = structure(c(1451960055, 1451960056, 
1451962800, 1451975526, 1453118186, 1453211770, 1453242132, 1453683500, 
1453859899, 1453923567, 1454161008, 1454334848, 1454668930, 1455499217, 
1455552432, 1457003644, 1457445783, 1457542907, 1458598877, 1458822887, 
1458941209, 1459028558, 1459083990, 1452370178, 1452392808, 1452656986, 
1452768517, 1453158918, 1453472619, 1453502485, 1454185580, 1454894182, 
1455229448, 1455587211, 1455653538, 1455747987, 1455965658, 1456053774, 
1456138469, 1456482801, 1456591336, 1456839506, 1456945790, 1457099216, 
1457109800, 1457480525, 1457480533, 1457563544), class = c("POSIXct", 
"POSIXt"), tzone = "UTC"), event.seq = c(1, 1, 1, 1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 
20, 21, 22, 23, 23, 24)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -48L), .Names = c("id", "tstart", "tstop", 
"event.seq"))

> head(df.out)
# A tibble: 6 × 4
     id              tstart               tstop event.seq
  <dbl>              <dttm>              <dttm>     <dbl>
1   115 2016-01-05 02:14:06 2016-01-05 02:14:15         1
2   115 2016-01-05 02:14:15 2016-01-05 02:14:15         1
3   115 2016-01-05 02:14:15 2016-01-05 03:00:00         1
4   115 2016-01-05 03:00:00 2016-01-05 06:32:06         1
5   115 2016-01-18 11:52:09 2016-01-18 11:56:26         2
6   115 2016-01-19 13:50:10 2016-01-19 13:56:09         3

这让我更接近,但不是我想要的:

# Attempt from the question: flag every row that touches a neighbouring row.
# There is no group_by() here, so lead()/lag() look across id boundaries, and
# event.seq ends up as a 0/1 indicator rather than a running event counter.
df.2 <- df %>%
  arrange(id, tstart) %>%
  mutate(
    # Start time of the following row and stop time of the preceding row.
    next.start = lead(tstart),
    prev.stop = lag(tstop),
    # 1 when this row ends exactly where the next one begins.
    joins.next = as.numeric(tstop == next.start),
    # 1 when this row begins exactly where the previous one ended.
    joins.prev = as.numeric(tstart == prev.stop),
    # NA comparisons at the first/last row are ignored via na.rm = TRUE.
    event.seq = pmax(joins.next, joins.prev, na.rm = TRUE)
  ) %>%
  select(id, tstart, tstop, event.seq)

1 个答案:

答案 0 :(得分:1)

这有点棘手。由于您要为每个id重置,我们肯定需要group_by(id)。然后我们将创建一个列,指示每行是否不是前一行的延续。最后,我们可以对此指标使用cumsum。如果不是延续,则添加1并且event.seq增加。如果是延续,则添加0并且event.seq保持不变。我们再加1,使序列从1开始而不是从0开始。

library(dplyr)
# Number the events within each id. A row belongs to the same event as the
# previous row when its tstart equals the previous row's tstop; otherwise it
# starts a new event. The counter restarts at 1 for every id.
df.2 <-  df %>%
    # Sort so that lag() sees rows in chronological order within each id.
    arrange(id, tstart) %>%
    group_by(id) %>%
    mutate(
        # 1 when the row does NOT continue the previous one, 0 otherwise.
        # The first row of each id has no predecessor (lag() gives NA), so
        # its comparison is dropped with [-1] and replaced by a literal 0.
        not_continued = c(0, (lag(tstop) != tstart)[-1]),
        # Running count of event starts; + 1 makes the sequence 1-based.
        event.seq = 1 + cumsum(not_continued)
    ) %>%
    # Drop the grouping so the result is a plain tibble like df.out and
    # later verbs do not silently operate per id.
    ungroup() %>%
    select(-not_continued)

all.equal(df.2, df.out)
# [1] TRUE