答案 0 :(得分:2)
这是一个典型的“间隙与岛屿”（gaps-and-islands）问题。您可以按 start_time 排序，对此前各行的 end_time 计算累积最大值，再将其与当前行的 start_time 进行比较，从而找出每个岛屿的起点。
另外还需要一些额外的逻辑来处理 end_time 为 NULL 的情况。由于数据（大概）全部是过去的，我们先用将来的某个时间替换 NULL，最后再将其转换回 NULL。
所以：
-- Gaps-and-islands: collapse overlapping [start_time, end_time] intervals of
-- table t into one row per island, per id.
--
-- Output columns: id, start_time (earliest start in the island), end_time
-- (latest end, or NULL when any member interval is open-ended), cnt (number
-- of source rows merged into the island).
--
-- A NULL end_time is temporarily replaced by a far-future sentinel so it takes
-- part in max(); nullif() converts the sentinel back to NULL at the end.
-- This assumes no real end_time equals the sentinel value.
with filled as (
    -- Substitute the sentinel for open-ended intervals.
    select
        t.id,
        t.start_time,
        coalesce(t.end_time, datetime '2100-01-01') as end_time
    from t
),
with_prev_max as (
    -- Running maximum of end_time over all earlier rows of the same id;
    -- NULL on the first row of each id because the frame is empty there.
    select
        f.id,
        f.start_time,
        f.end_time,
        max(f.end_time) over (
            partition by f.id
            order by f.start_time
            rows between unbounded preceding and 1 preceding
        ) as prev_max_end_time
    from filled as f
),
flagged as (
    -- A row opens a new island when nothing before it reaches its start_time.
    -- The running count of such rows serves as the island identifier.
    -- The alias is deliberately not "grouping": that is a reserved keyword in
    -- GoogleSQL and cannot be used as an unquoted identifier.
    select
        p.id,
        p.start_time,
        p.end_time,
        countif(
            p.prev_max_end_time is null
            or p.prev_max_end_time < p.start_time
        ) over (
            partition by p.id
            order by p.start_time
        ) as island_id
    from with_prev_max as p
)
select
    id,
    min(start_time) as start_time,
    nullif(max(end_time), datetime '2100-01-01') as end_time,
    count(*) as cnt
from flagged
group by id, island_id
order by id, min(start_time);
答案 1 :(得分:0)
我更喜欢使用窗口函数的解决方案（我自己平时也用这种方法），不过这里给出另一种写法，仅供娱乐 :)
-- Merge overlapping intervals per id using self-joins instead of window
-- functions.
--
-- Output columns: id, number_of_unique_id (rows merged into the island),
-- start_time (earliest start), end_time (latest end, or NULL when any member
-- interval is open-ended).
--
-- NULL end_time is treated as the far-future sentinel 2099-01-01 while
-- comparing and aggregating, then mapped back to NULL with nullif().
--
-- Limitation: a row's group is built from the intervals reachable within two
-- overlap hops (its neighbours and their neighbours). A chain of four or more
-- intervals in which only consecutive ones overlap is therefore not guaranteed
-- to collapse into a single island.
with
sample_data as (
    select
        *
    from
        unnest(
            array[
                struct(1 as id, 'a' as unique_id, datetime'2019-07-21 20:30:05' as start_time, datetime'2019-07-21 20:30:10' as end_time),
                struct(1, 'b', datetime'2019-07-21 20:30:07', datetime'2019-07-21 20:30:17'),
                struct(1, 'c', datetime'2019-07-21 20:30:21', datetime'2019-07-21 20:30:25'),
                struct(1, 'd', datetime'2019-07-21 20:30:28', datetime'2019-07-21 20:30:33'),
                struct(1, 'e', datetime'2019-07-21 20:30:30', datetime'2019-07-21 20:30:36'),
                struct(1, 'f', datetime'2019-07-21 20:30:29', datetime'2019-07-21 20:30:45'),
                struct(1, 'g', datetime'2019-07-21 20:30:50', datetime'2019-07-21 20:30:53'),
                struct(1, 'h', datetime'2019-07-21 20:30:52', datetime'2019-07-21 20:30:59'),
                struct(1, 'i', datetime'2019-07-21 20:30:56', null),
                struct(2, 'j', datetime'2019-07-21 20:30:07', datetime'2019-07-21 20:30:14'),
                struct(2, 'k', datetime'2019-07-21 20:30:18', datetime'2019-07-21 20:30:30'),
                struct(2, 'l', datetime'2019-07-21 20:30:21', datetime'2019-07-21 20:30:30'),
                struct(2, 'm', datetime'2019-07-21 20:30:41', null),
                struct(2, 'n', datetime'2019-07-21 20:30:48', null),
                struct(3, 'o', datetime'2019-07-21 20:30:20', null),
                struct(3, 'p', datetime'2019-07-21 20:30:30', null)
            ]
        )
),
joins as (
    -- Every (interval, overlapping interval) pair within the same id.
    -- Each interval also pairs with itself, provided start_time <= end_time.
    select
        sd1.id,
        sd1.unique_id,
        sd1.start_time,
        sd1.end_time,
        sd2.unique_id as unique_id2
    from
        sample_data as sd1
        left join sample_data as sd2 on sd1.id = sd2.id
            and sd1.start_time <= coalesce(sd2.end_time, datetime'2099-01-01')
            and coalesce(sd1.end_time, datetime'2099-01-01') >= sd2.start_time
),
by_groups as (
    -- For each interval, collect every unique_id that overlaps one of the
    -- intervals it overlaps. Self-pairs are kept on purpose: they guarantee
    -- that an isolated interval gets the group [its own unique_id] rather
    -- than [NULL], which would make all isolated intervals of one id share
    -- the same (empty) grouping key in the final step.
    select
        j1.id,
        j1.unique_id,
        j1.start_time,
        j1.end_time,
        array_agg(distinct j2.unique_id order by j2.unique_id) as group_id
    from
        joins as j1
        left join joins as j2 on j1.id = j2.id
            and j1.unique_id2 = j2.unique_id2
    group by
        j1.id,
        j1.unique_id,
        j1.start_time,
        j1.end_time
)
select
    id,
    array_length(group_id) as number_of_unique_id,
    min(start_time) as start_time,
    nullif(
        max(coalesce(
            end_time,
            datetime'2099-01-01')),
        datetime'2099-01-01') as end_time
from
    by_groups
group by
    -- Arrays cannot be grouped directly, so the sorted member list is
    -- serialised to a string to act as the island key.
    id,
    number_of_unique_id,
    array_to_string(group_id, ',')
order by
    id,
    min(start_time)