按时间和日期提取连续比赛

时间:2017-05-17 09:32:46

标签: r aggregate

我的数据框dat包含标题Date HourTYPE,如下所示:

dat <- structure(list(Date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L), .Label = "04/20/17", class = "factor"), 
    Hour = c("14:30:00", "14:31:00", "14:32:00", "14:33:00", 
    "14:34:00", "14:35:00", "14:36:00", "14:37:00", "14:38:00", 
    "14:39:00", "14:40:00", "14:41:00", "14:42:00", "14:43:00", 
    "14:44:00", "14:45:00", "14:46:00", "14:47:00", "14:48:00", 
    "14:49:00", "14:50:00", "14:51:00", "14:52:00", "14:53:00", 
    "14:54:00", "14:55:00", "14:56:00", "14:57:00", "14:58:00", 
    "14:59:00", "15:00:00", "15:01:00", "15:02:00", "15:03:00", 
    "15:04:00", "15:05:00", "15:06:00", "15:07:00", "15:08:00", 
    "15:09:00", "15:10:00", "15:11:00", "15:12:00", "15:13:00", 
    "15:14:00", "15:15:00", "15:16:00", "15:17:00", "15:18:00", 
    "15:19:00", "15:20:00", "15:21:00", "15:22:00", "15:23:00", 
    "15:24:00", "15:25:00", "15:26:00", "15:27:00", "15:28:00", 
    "15:29:00", "15:30:00", "15:31:00", "15:32:00", "15:33:00", 
    "15:34:00", "15:35:00", "15:36:00", "15:37:00", "15:38:00", 
    "15:39:00", "15:40:00", "15:41:00", "15:42:00", "15:43:00", 
    "15:44:00", "15:45:00", "15:46:00", "15:47:00", "15:48:00", 
    "15:49:00", "15:50:00", "15:51:00", "15:52:00", "15:53:00", 
    "15:54:00", "15:55:00", "15:56:00", "15:57:00", "15:58:00", 
    "15:59:00", "16:00:00", "16:01:00", "16:02:00", "16:03:00", 
    "16:04:00", "16:05:00", "16:06:00", "16:07:00", "16:08:00", 
    "16:09:00", "16:10:00", "16:11:00", "16:12:00", "16:13:00", 
    "16:14:00", "16:15:00", "16:16:00", "16:17:00", "16:18:00", 
    "16:19:00", "16:20:00", "16:21:00", "16:22:00", "16:23:00", 
    "16:24:00", "16:25:00", "16:26:00", "16:27:00", "16:28:00", 
    "16:29:00", "16:30:00", "16:31:00", "16:32:00", "16:33:00", 
    "16:34:00"), TYPE = c(3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
    2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3)), .Names = c("Date", 
"Hour", "TYPE"), row.names = 2:126, class = "data.frame")

在此示例中,数据框Date仅代表一天,但可能有很多天。 TYPE是数字,对应于所采取的行为(1-3)。

背景 连续跟随的类似TYPE数字表示正在进行的相同行为。因此,在14:30:00和14:37:00之间进行了单一行为,我想将其称为单Bout

我的问题: 我怎样才能对数据进行排序,以便最终得到如下数据框:

编辑包括下一天:

  Date      Bout   Start      End      TYPE
4/20/17      1   14:30:00   14:37:00     3
4/20/17      2   14:38:00   15:27:00     1
4/20/17      3   15:28:00   16:03:00     2
4/20/17      4   16:04:00   16:23:00     1
4/20/17      5   16:24:00   16:34:00     3 
4/21/17      1   13:00:00   13:01:00     1
4/21/17      2   13:02:00   13:05:00     2 

我想根据Date对它们进行分组,所以这里的日期将开始4/20/17,下一个(虽然未在示例中显示)将是4/21/17.

我使用aggregate收集了类似的数据,例如:

setNames(aggregate(cbind(dat$TYPE == 1, dat$TYPE == 2, dat$TYPE == 3), by=list(Category=dat$Date), FUN = sum), c("Date", "Standing", "Lying_right", "Lying_left"))但无法确定如何在此上下文中使用它。也许某个函数可以放在FUN的{​​{1}}中?

4 个答案:

答案 0 :(得分:1)

我们可以使用library(dplyr) dat %>% group_by(Date, Bout = cumsum(c(1, diff(TYPE) != 0))) %>% summarise(start = head(Hour, 1), End = tail(Hour, 1), TYPE = unique(TYPE)) #Source: local data frame [5 x 5] #Groups: Date [?] # Date Bout start End TYPE # <fctr> <dbl> <chr> <chr> <dbl> #1 04/20/17 1 14:30:00 14:37:00 3 #2 04/20/17 2 14:38:00 15:27:00 1 #3 04/20/17 3 15:28:00 16:03:00 2 #4 04/20/17 4 16:04:00 16:23:00 1 #5 04/20/17 5 16:24:00 16:34:00 3 interface ImagesAdi { @GET("android/determinace/json/images.json") fun getImages(): Single<List<ImagesResponse>> } 的分组取差值不等于0的值(即不同的值)的cumsum。

interface ImagesAdi {
    @GET("android/determinace/json/images.json")
    fun getImages(): Single<ImagesResponse>
}

答案 1 :(得分:1)

您可以使用data.table

library(data.table)
setDT(dat)
dat[,.(Date=first(Date),Start=first(Hour),End=last(Hour)),by=.(Bout=rleid(TYPE),TYPE)]
# Bout TYPE     Date    Start      End
# 1:    1    3 04/20/17 14:30:00 14:37:00
# 2:    2    1 04/20/17 14:38:00 15:27:00
# 3:    3    2 04/20/17 15:28:00 16:03:00
# 4:    4    1 04/20/17 16:04:00 16:23:00
# 5:    5    3 04/20/17 16:24:00 16:34:00

答案 2 :(得分:1)

另一个data.table解决方案。将日期和时间作为时间戳处理,以便能够在两天结束的情况下处理案例。

library(data.table)
setDT(dat)
dat[, timestamp := strptime(paste(as.character(Date), Hour), format = "%m/%d/%y %T")]
dat[, bout_id := rep(seq(length(rle(TYPE)$values)),  rle(TYPE)$lengths)]
dat[, list(Start = min(timestamp), End = max(timestamp), TYPE = TYPE[1]),by = bout_id]

答案 3 :(得分:1)

以下是使用rlereshape一天工作的基本R方法。

# get information on the length of each bout
temp <- rle(dat$TYPE)
stoppers <- cumsum(temp$lengths)
stoppers <- sort(c(1, stoppers, head(stoppers + 1, -1)))

# subset data according to start and stop point of each bout
datNew <- dat[stoppers, ]
# provide id for each bout
datNew$Bout <- rep(seq_along(temp$lengths), each=2)
# provide start and stop value
datNew$timevar <- 1:2

现在,重塑广泛产生

reshape(datNew, idvar="Bout", direction="wide", v.names="Hour", timevar="timevar")
        Date TYPE Bout   Hour.1   Hour.2
2   04/20/17    3    1 14:30:00 14:37:00
10  04/20/17    1    2 14:38:00 15:27:00
60  04/20/17    2    3 15:28:00 16:03:00
96  04/20/17    1    4 16:04:00 16:23:00
116 04/20/17    3    5 16:24:00 16:34:00

如果要在不同的日期执行此操作,可以将data.frame拆分为日期的data.frames列表,将上述函数应用于每个data.frame,然后使用do.call在结果上使用rbind来组合每个日期。这看起来像

do.call(rbind,
        lapply(split(dat, dat$Date), function(x) {
          temp <- rle(x$TYPE)
          stoppers <- cumsum(temp$lengths)
          stoppers <- sort(c(1, stoppers, head(stoppers + 1, -1)))

          # subset data according to start and stop point of each bout
          datNew <- x[stoppers, ]
          # provide id for each bout
          datNew$Bout <- rep(seq_along(temp$lengths), each=2)
          # provide start and stop value
          datNew$timevar <- 1:2

          reshape(datNew, idvar="Bout", direction="wide", v.names="Hour", timevar="timevar")
}))