在PostgreSQL中合并重叠时间范围

时间:2016-07-14 21:05:50

标签: postgresql range recursive-query date-range recursive-cte

我有一个PostgreSQL(9.4)表,其中包含时间戳范围和用户ID,我需要将任何重叠范围(具有相同的用户ID)折叠到一个记录中。

我已经尝试用一组复杂的CTE来实现这一目标,但我们的真实表(40,000多行)中存在一些边缘情况,使问题变得更加复杂。我得出的结论是,我可能需要一个递归CTE,但一直没能把它写出来。

下面是创建测试表并向其中填充数据的代码。它不是我们真实表的确切结构,但作为示例已经足够接近。

-- Sample table for the question: each row is one session, stored as a
-- timestamptz range together with the id of the user it belongs to.
create table public.test (
    id           serial,     -- auto-incrementing row identifier
    sessionrange tstzrange,  -- session start/end as a timestamp-with-time-zone range
    fk_user_id   integer     -- id of the user the session belongs to
);

-- Seed the sample table with twelve sessions spread over four users;
-- several sessions of the same user overlap each other.
insert into test (sessionrange, fk_user_id) values
    ('[2016-01-14 11:57:01-05,2016-01-14 12:06:59-05]', 1),
    ('[2016-01-14 12:06:53-05,2016-01-14 12:17:28-05]', 1),
    ('[2016-01-14 12:17:24-05,2016-01-14 12:21:56-05]', 1),
    ('[2016-01-14 18:18:00-05,2016-01-14 18:42:09-05]', 2),
    ('[2016-01-14 18:18:08-05,2016-01-14 18:18:15-05]', 1),
    ('[2016-01-14 18:38:12-05,2016-01-14 18:48:20-05]', 1),
    ('[2016-01-14 18:18:16-05,2016-01-14 18:18:26-05]', 1),
    ('[2016-01-14 18:18:24-05,2016-01-14 18:18:31-05]', 1),
    ('[2016-01-14 18:18:12-05,2016-01-14 18:18:20-05]', 3),
    ('[2016-01-14 19:32:12-05,2016-01-14 23:18:20-05]', 3),
    ('[2016-01-14 18:18:16-05,2016-01-14 18:18:26-05]', 4),
    ('[2016-01-14 18:18:24-05,2016-01-14 18:18:31-05]', 2);

我发现我可以这样做,以便按照开始时间对会话进行排序:

-- List every session grouped by user; within a user, range ordering sorts
-- by the lower bound (session start) first.
select *
from test
order by
    fk_user_id,
    sessionrange;

我可以使用它来确定单个记录是否与前一个记录重叠,使用窗口函数:

-- For each session, report whether it overlaps the session immediately
-- before it (same user, range order). The first session of every user has
-- no predecessor, so the extra column is NULL there.
select
    *,
    sessionrange && lag(sessionrange) over per_user
from test
window per_user as (partition by fk_user_id order by sessionrange)
order by
    fk_user_id,
    sessionrange;

但这仅检测单个先前记录是否与当前记录重叠(请参阅记录id = 6)。我需要一直检测到分区的开头。

之后,我需要对任何重叠的记录进行分组,以找到最早会话的开始和最后一个会话的结束。

我确信有某种我没有注意到的方法可以做到这一点。如何折叠这些重叠的记录?

1 个答案:

答案 0 :(得分:1)

如果把范围作为数组元素传入,合并重叠范围就相对容易。为简单起见,以下函数返回 setof tstzrange:

-- merge_ranges(tstzrange[]) -> setof tstzrange
--
-- Collapses the overlapping ranges of the input array and returns one row
-- per resulting range, in ascending range order.
--
-- Parameter $1: the ranges to merge, in any order. NULL elements are
-- ignored; a NULL or empty array yields no rows.
--
-- The single-pass merge below is only correct when the ranges are visited
-- in range order (lower bound first), so the function sorts the elements
-- itself instead of relying on the caller's array order. Ranges that merely
-- touch without sharing a point (e.g. '[a,b)' and '[b,c)') are not merged,
-- because only the overlap operator && is tested.
create or replace function merge_ranges(tstzrange[])
returns setof tstzrange language plpgsql as $$
declare
    t tstzrange;  -- element currently being examined
    r tstzrange;  -- merged range accumulated so far; NULL before the first element
begin
    for t in
        select u.elem
        from unnest($1) as u(elem)
        where u.elem is not null
        order by u.elem
    loop
        if r && t then
            -- t starts inside the accumulated range: extend it.
            r := r + t;
        else
            -- Gap (or very first element): emit what has been accumulated
            -- and start a new group with t.
            if r is not null then
                return next r;
            end if;
            r := t;
        end if;
    end loop;

    -- Emit the last group, if there was any input at all.
    if r is not null then
        return next r;
    end if;
end $$;

只需按用户聚合范围,然后调用该函数:

-- One output row per merged range per user.
-- The aggregate carries an explicit ORDER BY because array_agg otherwise
-- collects its input in an unspecified order, and the merge is a single
-- pass that assumes ranges arrive sorted by their lower bound.
select
    fk_user_id,
    merge_ranges(array_agg(sessionrange order by sessionrange))
from test
group by fk_user_id
order by
    fk_user_id,
    merge_ranges;

 fk_user_id |                    merge_ranges                     
------------+-----------------------------------------------------
          1 | ["2016-01-14 17:57:01+01","2016-01-14 18:21:56+01"]
          1 | ["2016-01-15 00:18:08+01","2016-01-15 00:18:15+01"]
          1 | ["2016-01-15 00:18:16+01","2016-01-15 00:18:31+01"]
          1 | ["2016-01-15 00:38:12+01","2016-01-15 00:48:20+01"]
          2 | ["2016-01-15 00:18:00+01","2016-01-15 00:42:09+01"]
          3 | ["2016-01-15 00:18:12+01","2016-01-15 00:18:20+01"]
          3 | ["2016-01-15 01:32:12+01","2016-01-15 05:18:20+01"]
          4 | ["2016-01-15 00:18:16+01","2016-01-15 00:18:26+01"]
(8 rows)    

或者,该算法可以在一个函数循环中应用于整个表。我不确定,但对于大型数据集,这种方法应该更快。

-- merge_ranges_in_test() -> setof test
--
-- Scans the whole test table once, ordered by user and range, and returns
-- one row per group of overlapping sessions of the same user. Each returned
-- row carries the id of the first row of its group (in scan order) and the
-- union of the group's ranges. The table itself is not modified.
--
-- An empty table yields no rows. Rows whose sessionrange is NULL never
-- overlap anything and are returned unchanged.
create or replace function merge_ranges_in_test()
returns setof test language plpgsql as $$
declare
    curr test;                      -- row currently being read
    prev test;                      -- group accumulated so far
    have_prev boolean := false;     -- true once prev holds a real row
begin
    -- "select *" is intentional: the row variables are declared with the
    -- table's own row type, so the column lists always match.
    for curr in
        select *
        from test
        order by fk_user_id, sessionrange
    loop
        -- An explicit flag is used instead of "prev notnull": for a
        -- composite value that test is true only when EVERY field is
        -- non-null, which would silently drop rows with a NULL column.
        if have_prev
           and prev.fk_user_id is not distinct from curr.fk_user_id
           and prev.sessionrange && curr.sessionrange
        then
            -- Same user and overlapping: grow the current group.
            prev.sessionrange := prev.sessionrange + curr.sessionrange;
        else
            -- New user or a gap: emit the finished group and start a new one.
            if have_prev then
                return next prev;
            end if;
            prev := curr;
            have_prev := true;
        end if;
    end loop;

    -- Emit the last group; nothing is emitted for an empty table.
    if have_prev then
        return next prev;
    end if;
end $$;

结果:

-- Show the merged sessions produced by the table-scanning function.
select merged.*
from merge_ranges_in_test() as merged;

 id |                    sessionrange                     | fk_user_id 
----+-----------------------------------------------------+------------
  1 | ["2016-01-14 17:57:01+01","2016-01-14 18:21:56+01"] |          1
  5 | ["2016-01-15 00:18:08+01","2016-01-15 00:18:15+01"] |          1
  7 | ["2016-01-15 00:18:16+01","2016-01-15 00:18:31+01"] |          1
  6 | ["2016-01-15 00:38:12+01","2016-01-15 00:48:20+01"] |          1
  4 | ["2016-01-15 00:18:00+01","2016-01-15 00:42:09+01"] |          2
  9 | ["2016-01-15 00:18:12+01","2016-01-15 00:18:20+01"] |          3
 10 | ["2016-01-15 01:32:12+01","2016-01-15 05:18:20+01"] |          3
 11 | ["2016-01-15 00:18:16+01","2016-01-15 00:18:26+01"] |          4
(8 rows)

这个问题非常有趣。我曾试图找到一个递归的解决方案,但看起来过程式的方法才是最自然、最高效的。

我终于找到了一个递归解决方案。该查询会删除重叠的行,并插入合并后的等效行:

-- Collapse overlapping sessions of the same user in place: every group of
-- two or more overlapping rows is deleted and replaced by a single row that
-- keeps the group's smallest id and carries the union of its ranges.
-- Rows that overlap nothing are left untouched.
--
-- The recursion walks each user's rows one by one in range order, so the
-- number of iterations equals the largest number of rows any user has.
-- Groups are tracked per user, so two users whose merged ranges happen to
-- be identical each keep their own row.
--
-- NOTE(review): "deleted" is not referenced by the main statement; if the
-- real table has a unique constraint on id, confirm that re-inserting the
-- kept id in the same statement does not conflict.
with recursive ordered as (
    -- Number each user's sessions in range order (lower bound first);
    -- id breaks ties so the numbering is deterministic.
    select
        id,
        fk_user_id,
        sessionrange,
        row_number() over (
            partition by fk_user_id
            order by sessionrange, id
        ) as rn
    from test
),
walk (user_id, rn, id, group_id, merged) as (
    -- Start with the first session of every user; it opens its own group.
    select fk_user_id, rn, id, id, sessionrange
    from ordered
    where rn = 1
union all
    -- Step to the user's next session: if it overlaps the range merged so
    -- far it joins the current group, otherwise it opens a new group.
    select
        w.user_id,
        o.rn,
        o.id,
        case when w.merged && o.sessionrange
             then w.group_id
             else o.id
        end,
        case when w.merged && o.sessionrange
             then w.merged + o.sessionrange
             else o.sessionrange
        end
    from walk as w
    inner join ordered as o
        on o.fk_user_id = w.user_id
        and o.rn = w.rn + 1
),
overlap_groups as (
    -- Only groups with at least two members need collapsing.
    select
        user_id,
        group_id,
        min(id) as keep_id,
        max(rn) as last_rn
    from walk
    group by user_id, group_id
    having count(*) > 1
),
collapsed as (
    -- The last row walked in a group holds the union of the whole group.
    select
        g.keep_id as id,
        w.merged,
        g.user_id
    from overlap_groups as g
    inner join walk as w
        on w.user_id = g.user_id
        and w.rn = g.last_rn
),
deleted as (
    delete from test
    where id in (
        select w.id
        from walk as w
        inner join overlap_groups as g
            on g.user_id = w.user_id
            and g.group_id = w.group_id
    )
)
insert into test (id, sessionrange, fk_user_id)
select id, merged, user_id
from collapsed
order by merged, id;

结果:

-- Show the table after collapsing, grouped by user and sorted by range.
select
    id,
    sessionrange,
    fk_user_id
from test
order by
    fk_user_id,
    sessionrange;

 id |                    sessionrange                     | fk_user_id 
----+-----------------------------------------------------+------------
  1 | ["2016-01-14 17:57:01+01","2016-01-14 18:21:56+01"] |          1
  5 | ["2016-01-15 00:18:08+01","2016-01-15 00:18:15+01"] |          1
  7 | ["2016-01-15 00:18:16+01","2016-01-15 00:18:31+01"] |          1
  6 | ["2016-01-15 00:38:12+01","2016-01-15 00:48:20+01"] |          1
  4 | ["2016-01-15 00:18:00+01","2016-01-15 00:42:09+01"] |          2
  9 | ["2016-01-15 00:18:12+01","2016-01-15 00:18:20+01"] |          3
 10 | ["2016-01-15 01:32:12+01","2016-01-15 05:18:20+01"] |          3
 11 | ["2016-01-15 00:18:16+01","2016-01-15 00:18:26+01"] |          4
(8 rows)