我正在处理上一个问题(previous question)之后数据聚合的下一步。在那个问题中,Jon Spring 向我指出了一种统计给定时间间隔内处于活动(active)状态的事件数量的解决方案。
下一步,我希望能够汇总这些数据,得到在每个固定时间间隔内任一时刻处于活动状态的不同 ID 的数量(同一 ID 的多条观测只计一次)。
从一个包含五个 ID、七个事件的玩具数据集开始:
library(tidyverse); library(lubridate)
# Toy data: seven events spread over five ids ("c" has three events).
# Each column is parsed with a single vectorised ymd_hms() call, which
# returns one UTC POSIXct vector.
df1 <- tibble::tibble(
  id = c("a", "b", "c", "c", "c", "d", "e"),
  start = ymd_hms(c(
    "2018-12-10 13:01:00",
    "2018-12-10 13:07:00",
    "2018-12-10 14:45:00",
    "2018-12-10 14:48:00",
    "2018-12-10 14:52:00",
    "2018-12-10 14:45:00",
    "2018-12-10 14:45:00"
  )),
  end = ymd_hms(c(
    "2018-12-10 13:05:00",
    "2018-12-10 13:17:00",
    "2018-12-10 14:46:00",
    "2018-12-10 14:50:00",
    "2018-12-10 15:01:00",
    "2018-12-10 14:51:00",
    "2018-12-10 15:59:00"
  ))
)
我可以在数据帧的每一行上进行蛮力循环,并将每条记录“扩展”到指定的间隔,该间隔涵盖从开始到结束的时间段,此处使用15分钟:
# Expand each event into the 15-minute buckets it touches: from the bucket
# containing its start up to the last bucket boundary that is <= its end.
# The per-event pieces are collected in a list and bound once, instead of
# growing `result` with bind_rows() + distinct() on every iteration.
# seq_len() also keeps the expansion from running on a zero-row `df1`.
start_floor <- floor_date(df1$start, "15 mins")
pieces <- lapply(seq_len(nrow(df1)), function(i) {
  tibble::tibble(
    timestamp = seq.POSIXt(start_floor[i], df1$end[i], by = "15 mins"),
    id = df1$id[i]
  )
})
# An id with several events in the same bucket must be counted once, so
# duplicate (timestamp, id) pairs are dropped.
result <- bind_rows(pieces) %>%
  distinct()
然后通过简单的聚合即可获得最终结果:
# Number of rows per 15-minute bucket; `result` holds one row per
# (timestamp, id) pair, so this is the number of ids active in each bucket.
result_agg <- count(result, timestamp, name = "users_mac")
这给出了理想的结果,但可能无法很好地扩展到我实际需要处理的数据集(目前约有 700 万条记录,而且还在不断增长)。
这个问题有更好的解决方法吗?
答案 0 :(得分:2)
我不确定效率如何,但一种做法是:先创建一个从数据的最小时间到最大时间、间隔为 15 分钟的时间序列,然后统计每个时间段内处于活动状态的不同用户(ID)数量。
library(tidyverse)
library(lubridate)
# 15-minute bucket starts covering the data. The earliest start is floored
# *before* the sequence is built, so the grid sits on bucket boundaries and
# reaches the bucket that contains the latest end.
timestamp <- seq(floor_date(min(df1$start), "15 mins"),
                 max(df1$end),
                 by = "15 mins")

# For every bucket [bucket_start, bucket_start + 15 min) count the distinct
# ids with at least one event touching it: the event must start before the
# bucket ends and must not end before the bucket starts.
tibble(timestamp) %>%
  mutate(users_mac = map_dbl(timestamp, function(bucket_start) {
    active <- df1$start < bucket_start + minutes(15) &
      df1$end >= bucket_start
    n_distinct(df1$id[active])
  })) %>%
  # keep only buckets in which somebody was active
  filter(users_mac != 0)
# timestamp users_mac
# <dttm> <dbl>
#1 2018-12-10 13:00:00 2
#2 2018-12-10 13:15:00 1
#3 2018-12-10 14:45:00 3
#4 2018-12-10 15:00:00 2
#5 2018-12-10 15:15:00 1
#6 2018-12-10 15:30:00 1
#7 2018-12-10 15:45:00 1
答案 1 :(得分:2)
使用 tsibble 软件包可以实现整洁的解决方案。
library(tidyverse)
#> Registered S3 methods overwritten by 'ggplot2':
#> method from
#> [.quosures rlang
#> c.quosures rlang
#> print.quosures rlang
#> Registered S3 method overwritten by 'rvest':
#> method from
#> read_xml.response xml2
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#>
#> date
library(tsibble, warn.conflicts = FALSE)
# Toy data: seven events spread over five ids ("c" has three events).
# Each column is parsed with a single vectorised ymd_hms() call, which
# returns one UTC POSIXct vector.
df1 <- tibble(
  id = c("a", "b", "c", "c", "c", "d", "e"),
  start = ymd_hms(c(
    "2018-12-10 13:01:00",
    "2018-12-10 13:07:00",
    "2018-12-10 14:45:00",
    "2018-12-10 14:48:00",
    "2018-12-10 14:52:00",
    "2018-12-10 14:45:00",
    "2018-12-10 14:45:00"
  )),
  end = ymd_hms(c(
    "2018-12-10 13:05:00",
    "2018-12-10 13:17:00",
    "2018-12-10 14:46:00",
    "2018-12-10 14:50:00",
    "2018-12-10 15:01:00",
    "2018-12-10 14:51:00",
    "2018-12-10 15:59:00"
  ))
)
# Count the distinct ids active in each 15-minute bucket.
# Every event gets its own key (`event`) so that fill_gaps() only fills the
# buckets between the start and end of that single event; keying by id alone
# would also fill the idle time between two separate events of the same id.
df1 %>%
  mutate(
    event = row_number(),
    start = floor_date(start, "15 mins"),
    end = floor_date(end, "15 mins")
  ) %>%
  pivot_longer(c(start, end), names_to = "label", values_to = "index") %>%
  # an event that starts and ends in the same bucket yields one row only
  distinct(id, event, index) %>%
  as_tsibble(key = c(id, event), index = index) %>%
  fill_gaps() %>%
  index_by(index) %>%
  # several events of one id in the same bucket count once
  summarise(users_mac = n_distinct(id))
#> # A tsibble: 7 x 2 [15m] <UTC>
#> index users_mac
#> <dttm> <int>
#> 1 2018-12-10 13:00:00 2
#> 2 2018-12-10 13:15:00 1
#> 3 2018-12-10 14:45:00 3
#> 4 2018-12-10 15:00:00 2
#> 5 2018-12-10 15:15:00 1
#> 6 2018-12-10 15:30:00 1
#> 7 2018-12-10 15:45:00 1
由reprex package(v0.2.1)于2019-05-17创建
答案 2 :(得分:0)
使用 lubridate 的 as.interval() 和 int_overlaps() 函数,再用 tidyverse 做一些数据整理,即可得到汇总结果:
library(dplyr)
library(tidyr)
library(lubridate)
# 15-minute time buckets covering the data. The earliest start is floored
# *before* the sequence is built so the grid sits on bucket boundaries and
# reaches the bucket containing the latest end. Each bucket's end is its own
# start + 15 minutes, so the final bucket is kept (pairing starts with lead()
# and dropping the NA row discarded it).
bucket_start <- seq(floor_date(min(df1$start), "15 mins"),
                    max(df1$end),
                    by = "15 mins")
timestamp <- as.interval(bucket_start, bucket_start + minutes(15))

# add in interval on df1 start -- end times
df1 <- mutate(df1, interval = as.interval(start, end))

# One row per record, one logical column per bucket - may not scale if there
# are many buckets. int_overlaps() treats both interval ends as inclusive, so
# an event starting exactly on a bucket's end would also match the previous
# bucket; the extra condition makes the bucket end exclusive.
# rbind() keeps the records-by-buckets orientation even with a single bucket.
tmp <- do.call(rbind, lapply(seq_len(nrow(df1)), function(i) {
  event <- df1$interval[i]
  int_overlaps(event, timestamp) & int_start(event) < int_end(timestamp)
}))
# explicit format so the labels always carry the time of day
colnames(tmp) <- format(int_start(timestamp), "%Y-%m-%d %H:%M:%S")

# count how many unique ids in each time bucket
bind_cols(select(df1, id), as_tibble(tmp)) %>%
  pivot_longer(-id, names_to = "start", values_to = "logged") %>%
  filter(logged) %>%
  group_by(start) %>%
  summarise(n = n_distinct(id))
# Expected result for the toy data:
# # A tibble: 7 x 2
#   start                   n
#   <chr>               <int>
# 1 2018-12-10 13:00:00     2
# 2 2018-12-10 13:15:00     1
# 3 2018-12-10 14:45:00     3
# 4 2018-12-10 15:00:00     2
# 5 2018-12-10 15:15:00     1
# 6 2018-12-10 15:30:00     1
# 7 2018-12-10 15:45:00     1