Question

我有以下数据：

StartDate   |  EndDate
-------------------------
1982.03.02  |  1982.09.30 
1982.10.01  |  1985.01.17 
1985.06.26  |  1985.07.26 
1985.07.30  |  1991.12.31 
1992.01.01  |  1995.12.31 
1996.01.01  |  2004.05.31 
2004.06.05  |  2006.01.31 
2006.02.01  |  2011.05.20

我需要合并任何相邻的间隔（开始日期和结束日期都包含在间隔中，因此结束于2003.05.06的间隔与2003.05.07开始的间隔相邻），所以在这种情况下，结果集应该是：

StartDate   |  EndDate
-------------------------
1982.03.02  |  1985.01.17 
1985.06.26  |  1985.07.26 
1985.07.30  |  2004.05.31 
2004.06.05  |  2011.05.20

对我来说，显而易见的方法是使用游标迭代集合，并逐行构造结果集。但是，此功能所在的代码可能会在负载很重的服务器上一天被调用数千次，因此我不希望出现任何性能问题。每个数据集都很小（最多20行），但日期范围很大，因此任何需要生成范围内所有日期的解决方案都是不可行的。

有没有更好的方式我没有看到？

初始化代码（来自Damien的答案）：

-- Sample table of closed (inclusive) date periods.
-- StartDate is the clustered primary key, so two periods cannot share a start.
CREATE TABLE Periods (
    StartDate datetime NOT NULL,
    EndDate   datetime NOT NULL,
    CONSTRAINT PK_Periods PRIMARY KEY CLUSTERED (StartDate)
);

-- The eight sample periods from the question, as unseparated yyyymmdd literals.
INSERT INTO Periods (StartDate, EndDate)
VALUES
    ('19820302', '19820930'),
    ('19821001', '19850117'),
    ('19850626', '19850726'),
    ('19850730', '19911231'),
    ('19920101', '19951231'),
    ('19960101', '20040531'),
    ('20040605', '20060131'),
    ('20060201', '20110520');

Answer 1

在迄今为止提交的所有查询中，这个查询的性能最好，它在执行计划中只有两次表访问（而不是三次或更多）。当然，所有查询都借助了索引。请注意，执行计划将此查询评估为开销更大，但它实际的读取次数和CPU开销明显更好。执行计划中的估计成本与实际性能并不是一回事。

-- Merge adjacent closed (inclusive) periods without expanding them into days.
-- Every Periods row is emitted once per direction and survives the anti-join as
--   * an island start (Dir = -1) when no period ends the day before it starts;
--   * an island end   (Dir =  1) when no period starts the day after it ends.
WITH Grps AS (
   SELECT
      -- Ordered by StartDate, the surviving rows come in start/end pairs, so
      -- integer division by 2 gives both rows of a pair the same Grp number.
      (Row_Number() OVER (ORDER BY P1.StartDate) - 1) / 2 Grp,
      P1.StartDate,
      P1.EndDate
   FROM
      Periods P1
      CROSS JOIN (SELECT -1 UNION ALL SELECT 1) D (Dir)
      -- Each direction probes only its own boundary. Probing both boundaries
      -- for both directions lets a two-day period match itself
      -- (EndDate - 1 day = its own StartDate) and vanish from the result.
      LEFT JOIN Periods P2 ON
         (D.Dir = -1 AND DateAdd(Day, -1, P1.StartDate) = P2.EndDate)
         OR (D.Dir = 1 AND DateAdd(Day, 1, P1.EndDate) = P2.StartDate)
   WHERE
      -- Anti-join: StartDate is NOT NULL in Periods, so NULL means "no neighbour".
      P2.StartDate IS NULL
)
SELECT
   Min(StartDate) StartDate,
   Max(EndDate) EndDate
FROM Grps
GROUP BY Grp;

我认为值得一提的另一件事是，如果您使用独占结束日期（也称为“开放”结束日期）而不是封闭日期，那么在大多数情况下查询日期周期表会更简单且性能更好：

StartDate   | EndDate     | EndDate
(Inclusive) | (Inclusive) | (Exclusive)
---------------------------------------
1982.03.02  | 1982.09.30  | 1982.10.01
1982.10.01  | 1985.01.17  | 1985.01.18

在大多数情况下，使用独占结束日期是（在我看来）最佳做法，因为它允许您更改日期列的数据类型或更改日期的分辨率，而不会影响任何查询、代码或其他逻辑。例如，如果您的日期需要精确到12小时而不是24小时，那么使用包含式结束日期时您需要做大量修改，而如果您使用独占结束日期，则什么都不需要改变！

如果您使用独占结束日期，我的查询将如下所示：

-- Merge adjacent periods stored with an exclusive (half-open) EndDate.
-- Two periods touch when one's EndDate equals the other's StartDate, so no
-- date arithmetic is needed to detect adjacency.
WITH Boundaries AS (
   SELECT
      -- Surviving rows sort into start/end pairs; halving the zero-based row
      -- number assigns both rows of a pair the same Grp.
      Grp = (Row_Number() OVER (ORDER BY Cur.StartDate) - 1) / 2,
      Cur.StartDate,
      Cur.EndDate
   FROM
      Periods Cur
      CROSS JOIN (SELECT 1 UNION ALL SELECT 2) Side (Which)
      -- Which = 1 looks for a predecessor, Which = 2 for a successor.
      LEFT JOIN Periods Adj ON
         (Side.Which = 2 AND Adj.StartDate = Cur.EndDate)
         OR (Side.Which = 1 AND Adj.EndDate = Cur.StartDate)
   WHERE
      -- Keep only rows for which no neighbour was found on that side.
      Adj.StartDate IS NULL
      OR Adj.EndDate IS NULL
)
SELECT
   StartDate = Min(StartDate),
   EndDate = Max(EndDate)
FROM Boundaries
GROUP BY Grp;

请注意，现在查询中不再有带硬编码值“1天”的DateAdd或DateDiff；否则，一旦您改用例如12小时的周期，这个值就必须跟着修改。

更新

这是一个更新的查询，其中包含了我在过去近5年中学到的内容。这个查询现在根本没有连接，虽然它确实有3个排序操作，这可能是性能问题，我认为这个查询将相当好地竞争，并且在没有索引的情况下可能会击败所有其他人。

-- Join-free merge of adjacent inclusive periods read from #Periods.
-- NOTE(review): assumes #Periods exposes StartDate and EndDate with the same
-- inclusive semantics as the Periods sample table - confirm.
WITH Groups AS (
   -- Duplicate every row, then halve the row number: each Grp now holds the
   -- second copy of one period and the first copy of the next period.
   SELECT
      Grp = Row_Number() OVER (ORDER BY StartDate) / 2,
      StartDate,
      EndDate
   FROM
      #Periods
      CROSS JOIN (VALUES (0), (0)) X (Dup)
), Ranges AS (
   -- Per Grp: Max(StartDate) is the later period's start, Min(EndDate) the
   -- earlier period's end. Keep only pairs that are NOT adjacent (real gaps).
   -- The first and last Grp hold a single row and always pass the filter.
   SELECT StartDate = Max(StartDate), EndDate = Min(EndDate)
   FROM Groups
   GROUP BY Grp
   HAVING Max(StartDate) <> DateAdd(day, 1, Min(EndDate))
), ReGroups AS (
   -- Same duplicate-and-halve trick, now pairing each island start with the
   -- end date that precedes the next gap.
   SELECT
      -- EndDate breaks the tie between the last gap row and the final row,
      -- which share a StartDate; without it the pairing is nondeterministic.
      Grp = Row_Number() OVER (ORDER BY StartDate, EndDate) / 2,
      StartDate,
      EndDate
   FROM
      Ranges
      CROSS JOIN (VALUES (0), (0)) X (Dup)
)
SELECT
   StartDate = Min(StartDate),
   EndDate = Max(EndDate)
FROM ReGroups
GROUP BY Grp
-- The first and last Grp contain one unpaired copy each; discard them.
HAVING Count(*) = 2
;

这是另一个使用窗口函数的版本（前一个查询实际上就是在模拟这类函数）：

-- Merge adjacent inclusive periods from #Periods using LAG/LEAD.
-- NOTE(review): the '00010101' sentinel is outside the datetime range, so this
-- presumably expects date/datetime2 columns in #Periods - confirm.
WITH LeadLag AS (
   -- Attach the previous period's end and the next period's start; sentinels
   -- stand in at both ends so the first/last row never looks adjacent.
   SELECT
      PrevEndDate = Coalesce(Lag(EndDate) OVER (ORDER BY StartDate), '00010101'),
      NextStartDate = Coalesce(Lead(StartDate) OVER (ORDER BY StartDate), '99991231'),
      StartDate,
      EndDate
   FROM #Periods
), Dates AS (
   SELECT
      -- Original start of the row, kept as a non-NULL ordering key.
      SortDate = L.StartDate,
      X.StartDate,
      X.EndDate
   FROM
      LeadLag L
      CROSS APPLY (
         SELECT
            -- Non-NULL only when this row opens an island.
            StartDate = CASE WHEN DateAdd(day, 1, L.PrevEndDate) <> L.StartDate THEN L.StartDate ELSE NULL END,
            -- Non-NULL only when this row closes an island.
            EndDate = CASE WHEN DateAdd(day, 1, L.EndDate) <> L.NextStartDate THEN L.EndDate ELSE NULL END
      ) X
   WHERE
      X.StartDate IS NOT NULL
      OR X.EndDate IS NOT NULL
), Final AS (
   SELECT
      StartDate,
      -- First island end at or after this row. Ordering by the nullable
      -- EndDate would sort every start-only row first and hand all of them the
      -- globally smallest end date, so the original StartDate is used instead.
      EndDate = Min(EndDate) OVER (ORDER BY SortDate ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING)
   FROM Dates
)
SELECT
   StartDate,
   EndDate
FROM Final
WHERE StartDate IS NOT NULL
;

Answer 2

设置示例数据比编写查询花的时间更长——如果您发布的问题中包含CREATE TABLE和INSERT/SELECT语句会更好。我不知道你的表叫什么，所以我把它命名为Periods：

-- Sample table of inclusive date periods (no key or index declared).
create table Periods (
    StartDate date not null,
    EndDate   date not null
)
go
-- Load the eight sample periods from the question.
insert into Periods (StartDate, EndDate)
values
    ('19820302', '19820930'),
    ('19821001', '19850117'),
    ('19850626', '19850726'),
    ('19850730', '19911231'),
    ('19920101', '19951231'),
    ('19960101', '20040531'),
    ('20040605', '20060131'),
    ('20060201', '20110520')
go
-- Recursive merge: start from every period that has no period ending the day
-- before it, then keep extending its end date through adjacent successors.
;with MergedPeriods as (
    -- Anchor: island heads (nothing ends on the day before StartDate).
    select head.StartDate, head.EndDate
    from Periods head
    where not exists (
        select *
        from Periods prev
        where DATEADD(day, 1, prev.EndDate) = head.StartDate
    )
    union all
    -- Step: append the period that starts the day after the current end.
    select chain.StartDate, nxt.EndDate
    from MergedPeriods chain
    inner join Periods nxt
        on DATEADD(day, -1, nxt.StartDate) = chain.EndDate
)
-- Each head appears once per extension step; the latest end date is the island's end.
select StartDate, MAX(EndDate) as EndDate
from MergedPeriods
group by StartDate

结果：

StartDate   EndDate
1982-03-02  1985-01-17
1985-06-26  1985-07-26
1985-07-30  2004-05-31
2004-06-05  2011-05-20

Answer 3

你可以先找出“头部”行，即开始一个新时间段的行。然后在子查询中查找下一个头部行之前的最后一个结束日期：

-- "heads" are the rows that open a merged period: no other row ends on the
-- day before they start. They are numbered in StartDate order so that each
-- head can look up the head that follows it.
; with heads as
(
    select  cur.StartDate,
            cur.EndDate,
            row_number() over (order by cur.StartDate) as rn
    from    @YourTable cur
    where   not exists
            (
            select  *
            from    @YourTable prev
            where   prev.EndDate = dateadd(day, -1, cur.StartDate)
            )
)
-- For each head, the merged end is the latest EndDate that falls before the
-- next head's StartDate; the last head falls back to the '9999-01-01' bound.
select  h.StartDate,
        (
        select  max(t.EndDate)
        from    @YourTable t
        where   t.EndDate < coalesce(
                (
                select  nh.StartDate
                from    heads nh
                where   nh.rn = h.rn + 1
                ), '9999-01-01')
        ) as EndDate
from    heads h

Example at ODATA.

Answer 4

嗯......我知道你说过

任何生成范围内所有日期的解决方案都是不可行的。

但出于某种原因，我只想表明如何做到这一点。我不是故意浪费你的时间。

首先，如果您还没有数字表，请创建一个数字表。

-- Auxiliary table of consecutive integers starting at 0.
CREATE TABLE Numbers (
   Num int NOT NULL CONSTRAINT PK_Numbers PRIMARY KEY CLUSTERED
)
INSERT Numbers (Num) VALUES (0)
-- Each pass re-inserts every existing value shifted past the current maximum,
-- doubling the row count. @@RowCount is the size of the previous insert, so
-- the loop stops after the pass that adds 65536 rows (131072 rows, 0..131071).
WHILE @@RowCount < 65536
   INSERT Numbers (Num)
   SELECT Num + (SELECT Max(Num) FROM Numbers) + 1
   FROM Numbers

然后将一些岛屿分组！

-- Gaps-and-islands over individual days: expand every period into one row per
-- day, then subtract a running rank from each day. Consecutive days keep the
-- same difference, so that difference identifies the island.
WITH Days AS (
   SELECT
      Dt = DateAdd(Day, N.Num, P.StartDate),
      Grp = DateAdd(
         Day,
         -DENSE_RANK() OVER (ORDER BY P.StartDate, N.Num),
         DateAdd(Day, N.Num, P.StartDate)
      )
   FROM
      Periods P
      -- One Numbers row per day offset, from 0 up to the period's length in days.
      INNER JOIN Numbers N ON N.Num <= DateDiff(Day, P.StartDate, P.EndDate)
)
SELECT
   StartDate = Min(Dt),
   EndDate = Max(Dt)
FROM Days
GROUP BY Grp
ORDER BY StartDate

如果您使用的是SQL 2000，此查询将无法运行；请告诉我，我会为您提供另一种解决方案。

Answer 5

这是PostgreSQL的一个非常相似的主题：

PostgreSQL matching interval between start and end time against timestamp

我只是对T-SQL有点熟悉，所以我不完全确定这个内容适用于你，但一般的想法是另外存储一个带有GIST（或R-tree）索引的可索引几何类型，并查询它。这将使查询非常快。

（以下示例段代码来自peufeu的回复，也适用于日期范围）：

-- One row per integer segment [start, stop], plus the same range stored as a
-- BOX spanning start..stop on the x axis so that it can be indexed with GiST.
CREATE TABLE segments (
    start     INTEGER NOT NULL,
    stop      INTEGER NOT NULL,
    range_box BOX     NOT NULL
);

-- One million unit-length segments: (1,2), (2,3), ...
INSERT INTO segments (start, stop, range_box)
SELECT
    n,
    n + 1,
    BOX(POINT(n, -1), POINT(n + 1, 1))
FROM generate_series(1, 1000000) AS n;

-- GiST index for box-overlap lookups, plus plain indexes on each bound.
CREATE INDEX segments_box ON segments USING gist (range_box);
CREATE INDEX segments_start ON segments (start);
CREATE INDEX segments_stop ON segments (stop);

-- Point lookup with plain range predicates; the plan printed below scans the segments_start index and filters on stop.
EXPLAIN ANALYZE SELECT * FROM segments WHERE 300000 BETWEEN start AND stop;
 Index Scan using segments_start on segments  (cost=0.00..12959.24 rows=209597 width=72) (actual time=91.990..91.990 rows=2 loops=1)
   Index Cond: (300000 >= start)
   Filter: (300000 <= stop)
 Total runtime: 92.023 ms

-- Same lookup written as a box-overlap (&&) test; the plan printed below uses the GiST index segments_box.
EXPLAIN ANALYZE SELECT * FROM segments WHERE range_box && '(300000,0,300000,0)'::BOX;
 Bitmap Heap Scan on segments  (cost=283.49..9740.27 rows=5000 width=72) (actual time=0.036..0.037 rows=2 loops=1)
   Recheck Cond: (range_box && '(300000,0),(300000,0)'::box)
   ->  Bitmap Index Scan on segments_box  (cost=0.00..282.24 rows=5000 width=0) (actual time=0.032..0.032 rows=2 loops=1)
         Index Cond: (range_box && '(300000,0),(300000,0)'::box)
 Total runtime: 0.064 ms

同样，上面是特定于PostgreSQL的，但是如果T-SQL中存在所需的类型/运算符/索引，也可能值得一看。

Answer 6

这是个旧帖子，但如果有人在寻找用PostGIS实现此操作的方法，下面是一个示例：

-- Create the data: a temporary table holding the eight sample periods as dates.
drop table if exists periods;
create temporary table periods as
select
  v.startdate::date as StartDate,
  v.enddate::date   as EndDate
from (values
  ('19820302', '19820930'),
  ('19821001', '19850117'),
  ('19850626', '19850726'),
  ('19850730', '19911231'),
  ('19920101', '19951231'),
  ('19960101', '20040531'),
  ('20040605', '20060131'),
  ('20060201', '20110520')
) as v(startdate, enddate);

-- Run with PostGIS
-- Each period becomes a line on y = 0 running from its start day number to its
-- end day number + 1 (days counted from 1970-01-01), so adjacent periods share
-- an endpoint. Unioning and line-merging fuses touching lines; the endpoints
-- of every merged line are then converted back to dates.
with period_lines as (
  select
    st_makeline(
      st_makepoint(startdate - '1970-01-01', 0),
      st_makepoint(enddate - '1970-01-01' + 1, 0)
    ) as the_geom
  from periods
),
merged_lines as (
  -- st_dump splits the merged geometry back into one row per connected line.
  select (st_dump(st_linemerge(st_union(the_geom)))).geom as line
  from period_lines
)
select
  '1970-01-01'::date + st_x(st_astext(st_pointn(line, 1)))::int4 as start,
  -- Undo the +1 that was added to make adjacent periods touch.
  '1970-01-01'::date + st_x(st_astext(st_pointn(line, st_numpoints(line))))::int4 - 1 as end
from merged_lines;

-- Result
start       |  end
-------------------------
1982-03-02  |  1985-01-17 
1985-06-26  |  1985-07-26 
1985-07-30  |  2004-05-31 
2004-06-05  |  2011-05-20

Answer 7

-- Sample data for the merge query that follows. The table must be created
-- (ALTER TABLE cannot take a column list), and it is named
-- [test].[dbo].[Periods] because that is the table the query reads.
-- NOTE(review): assumes a database named test exists - confirm.
create table [test].[dbo].[Periods] (
   StartDate date not null,
   EndDate date not null
)
go
insert into [test].[dbo].[Periods](StartDate,EndDate)
select '20130210','20130215' union all
select '20130216','20130228' union all
select '20130302','20130312' union all
select '20130317','20130325' union all
select '20130326','20130405' union all
select '20130406','20130411' union all
select '20130502','20130610'
go
-- Recursive merge of adjacent inclusive periods in [test].[dbo].[Periods].
; with MergedPeriods as (
    -- Anchor: periods with no period ending on the day before they start.
    Select p1.StartDate, p1.EndDate
    from
        [test].[dbo].[Periods] p1
            left join
        [test].[dbo].[Periods] p2
            on
                p1.StartDate = DATEADD(day,1,p2.EndDate)
    where
        p2.StartDate is null
    union all
    -- Step: extend the chain with the period that starts the day after its end.
    select p1.StartDate,p2.EndDate
    from
        MergedPeriods p1
            inner join
        [test].[dbo].[Periods] p2
            on
                p1.EndDate = DATEADD(day,-1,p2.StartDate)
)
-- One row per chain head. StartDate is the grouping key, so it is selected
-- directly and the output column keeps its name instead of being unnamed.
select StartDate,MAX(EndDate) as EndDate
from MergedPeriods group by StartDate

合并SQL Server中的日期间隔

7 个答案:

更新