如果日期列在TSQL中重叠,则合并行

时间:2019-05-07 08:31:40

标签: sql-server tsql sql-server-2008

我有一个以下格式的表

Id  StartDate   EndDate Type
1   2012-02-18  2012-03-18  1
1   2012-03-19  2012-06-19  1
1   2012-06-27  2012-09-27  1
1   2014-08-23  2014-09-24  3
1   2014-09-23  2014-10-24  3
1   2014-10-23  2014-11-24  3
2   2015-07-04  2015-08-06  1
2   2015-08-04  2015-09-06  1
3   2013-11-01  2013-12-01  0
3   2018-01-09  2018-02-09  0

我在这里找到了类似的问题,但是没有一个能帮我解决问题。我想合并 Id 和 Type 都相同、且日期区间重叠的行。另外,如果前一行的 EndDate 与后一行的 StartDate 之间的间隔不超过 14 天,我也想把它们视为重叠。

上表的结果应该是

Id  StartDate   EndDate Type
1   2012-02-18  2012-09-27  1
1   2014-08-23  2014-11-24  3
2   2015-07-04  2015-09-06  1
3   2013-11-01  2013-12-01  0
3   2018-01-09  2018-02-09  0

在另一台服务器上,我用下面的查询实现了合并,但它有以下限制:

  • 没有考虑相邻日期区间之间的间隔
  • 不关心 Type 列,而只关心 Id
  • 那台服务器是较新的 SQL Server 2012,而我现在使用的是 SQL Server 2008,这段代码与之不兼容

-- Merge date ranges per Id by numbering "islands" with a running sum.
-- Requires SQL Server 2012 or later (LAG and ordered window aggregates).
-- Known limits, as stated in the question: the Type column is ignored and no gap
-- tolerance is applied -- a row continues the current island only when its StartDate
-- equals the previous row's EndDate.
SELECT Id
     , MIN(StartDate) AS StartDate
     , MAX(EndDate) AS EndDate
FROM (
    SELECT a.Id
         , a.StartDate
         , a.EndDate
           -- Running count of island starts; rows sharing a value form one island.
           -- A NULL PrevEndDate (first row of an Id) falls through to ELSE 1.
         , SUM(CASE WHEN a.PrevEndDate = a.StartDate THEN 0
                    ELSE 1
               END
           ) OVER (ORDER BY a.Id, a.StartDate) AS sm
    FROM (
        SELECT Id
             , StartDate
             , EndDate
               -- The lagged value needs its own alias: a second column named EndDate
               -- makes derived table "a" invalid and leaves the CASE above unable to
               -- tell the previous end date from the current one.
             , LAG(EndDate, 1, NULL) OVER (PARTITION BY Id ORDER BY Id, EndDate) AS PrevEndDate
        FROM #temptable
    ) a
) b
GROUP BY Id, sm

有什么建议可以让我做到以下几点吗?

  • 在处理中考虑 Type 列
  • 判断重叠时允许最多 14 天的间隔
  • 使其能在 SQL Server 2008 上运行

5 个答案:

答案 0 :(得分:7)

此方法使用附加的临时表来标识重叠日期的组,然后根据这些组执行快速汇总。

-- Merge rows that share Id and Type and whose date ranges overlap or lie at most
-- @MaxGapDays apart. Runs on SQL Server 2008: only ROW_NUMBER() and a set-based
-- UPDATE loop are used.
DECLARE @MaxGapDays INT = 14;   -- largest gap (days) between two ranges that still counts as overlap
DECLARE @RowsChanged INT = 1;   -- rows relabelled by the latest pass; 0 ends the loop

-- Working copy: UID identifies a row; GroupId starts out equal to UID (one group per row).
SELECT Src.Id, Src.StartDate, Src.EndDate, Src.Type,
    Src.RowNo AS UID,
    Src.RowNo AS GroupId
INTO #G
FROM (
    SELECT Id, StartDate, EndDate, Type,
        ROW_NUMBER() OVER (ORDER BY Id, Type, StartDate, EndDate) AS RowNo
    FROM #TempTable
) AS Src;

WHILE @RowsChanged > 0
BEGIN
    -- Pull every row down to the smallest GroupId among the rows it touches.
    -- MIN() yields exactly one candidate per row, so the UPDATE is deterministic.
    -- GroupId only ever decreases, so the loop terminates once each connected
    -- set of ranges carries a single GroupId.
    UPDATE Cur
    SET GroupId = Nearby.MinGroupId
    FROM #G AS Cur
        INNER JOIN (
            SELECT A.UID, MIN(B.GroupId) AS MinGroupId
            FROM #G AS A
                INNER JOIN #G AS B
                    ON  A.Id = B.Id
                    AND A.Type = B.Type
                    AND B.GroupId < A.GroupId
                    -- Overlap test with each range's end extended by the allowed gap.
                    AND A.StartDate <= DATEADD(DAY, @MaxGapDays, B.EndDate)
                    AND B.StartDate <= DATEADD(DAY, @MaxGapDays, A.EndDate)
            GROUP BY A.UID
        ) AS Nearby
            ON Nearby.UID = Cur.UID;

    -- Capture the count immediately; any later statement would overwrite @@ROWCOUNT.
    SET @RowsChanged = @@ROWCOUNT;
END;

-- One output row per group: earliest start and latest end.
SELECT Id, MIN(StartDate) AS StartDate, MAX(EndDate) AS EndDate, Type
FROM #G
GROUP BY GroupId, Id, Type
ORDER BY Id, StartDate;

这将返回期望值

Id          StartDate  EndDate    Type
----------- ---------- ---------- -----------
1           2012-02-18 2012-09-27 1
1           2014-08-23 2014-11-24 3
2           2015-07-04 2015-09-06 1
3           2013-11-01 2013-12-01 0
3           2018-01-09 2018-02-09 0

答案 1 :(得分:2)

这与 SQL Server 2008 兼容。我认为,递归 CTE 实际上是把所有重叠记录链接起来的最佳方法。日期重叠逻辑来自以下帖子:SO Date Overlap

我添加了更复杂的额外数据,以确保其正常工作。

-- Sample data: the question's rows (with overlapping variants for Id 1) plus extra Id 4
-- cases. Dates are written as YYYY-MM-DD, which converts to DATE the same way under every
-- DATEFORMAT / language setting; m/d/yyyy strings do not.
-- The variable is spelled @Data everywhere so the script also runs on case-sensitive collations.
DECLARE @Data TABLE (Id INT, StartDate DATE, EndDate DATE, Type INT);

INSERT INTO @Data (Id, StartDate, EndDate, Type)
SELECT 1, '2012-02-18', '2012-03-18', 1 UNION ALL
SELECT 1, '2012-03-17', '2012-06-29', 1 UNION ALL
SELECT 1, '2012-06-27', '2012-09-27', 1 UNION ALL
SELECT 1, '2014-08-23', '2014-09-24', 3 UNION ALL
SELECT 1, '2014-09-23', '2014-10-24', 3 UNION ALL
SELECT 1, '2014-10-23', '2014-11-24', 3 UNION ALL
SELECT 2, '2015-07-04', '2015-08-06', 1 UNION ALL
SELECT 2, '2015-08-04', '2015-09-06', 1 UNION ALL
SELECT 3, '2013-11-01', '2013-12-01', 0 UNION ALL
SELECT 3, '2018-01-09', '2018-02-09', 0 UNION ALL
-- many non overlapping dates
SELECT 4, '2018-01-01', '2018-01-02', 0 UNION ALL
SELECT 4, '2018-01-04', '2018-01-05', 0 UNION ALL
SELECT 4, '2018-01-07', '2018-01-09', 0 UNION ALL
SELECT 4, '2018-01-11', '2018-01-13', 0 UNION ALL
-- many overlapping dates
SELECT 4, '2018-02-07', '2018-02-08', 0 UNION ALL
SELECT 4, '2018-02-08', '2018-02-09', 0 UNION ALL
SELECT 4, '2018-02-09', '2018-02-10', 0 UNION ALL
SELECT 4, '2018-02-10', '2018-02-11', 0 UNION ALL
SELECT 4, '2018-02-11', '2018-02-12', 0 UNION ALL
SELECT 4, '2018-02-12', '2018-02-13', 0 UNION ALL
-- many overlapping dates, second instance of id 4, type 0
SELECT 4, '2018-03-07', '2018-03-08', 0 UNION ALL
SELECT 4, '2018-03-08', '2018-03-09', 0 UNION ALL
SELECT 4, '2018-03-09', '2018-03-10', 0 UNION ALL
SELECT 4, '2018-03-10', '2018-03-11', 0 UNION ALL
SELECT 4, '2018-03-11', '2018-03-12', 0 UNION ALL
SELECT 4, '2018-03-12', '2018-03-13', 0;



;
-- Merge ranges per (Id, Type) with a recursive CTE (SQL Server 2008 compatible).
-- @MaxGapDays widens the overlap test: 0 merges only ranges that really overlap;
-- set it to 14 for the question's rule that a gap of up to 14 days counts as overlap.
DECLARE @MaxGapDays INT = 0;

WITH cdata
AS (
    -- Anchor: rows that open a merged range, i.e. no row of the same Id/Type starts
    -- earlier and still reaches this row's StartDate.
    SELECT  d.Id,
            d.Type,
            d.StartDate,
            d.EndDate,
            CurrentStart = d.StartDate
    FROM    @Data d
    WHERE   NOT EXISTS (
                SELECT *
                FROM   @Data x
                WHERE  x.Id = d.Id
                  AND  x.Type = d.Type
                  AND  x.StartDate < d.StartDate
                  AND  d.StartDate <= DATEADD(DAY, @MaxGapDays, x.EndDate)
                  AND  d.EndDate >= x.StartDate
            )
    UNION ALL
    -- Recursive step: attach any later-starting row that touches the range built so far
    -- and widen the range. CurrentStart strictly increases, so the recursion is finite.
    SELECT  d.Id,
            d.Type,
            StartDate = CASE WHEN d2.StartDate < d.StartDate THEN d2.StartDate ELSE d.StartDate END,
            EndDate = CASE WHEN d2.EndDate > d.EndDate THEN d2.EndDate ELSE d.EndDate END,
            CurrentStart = d2.StartDate
    FROM    cdata d
        INNER JOIN @Data d2
            ON  d2.Id = d.Id
            AND d2.Type = d.Type
            AND d.StartDate <= d2.EndDate
            AND DATEADD(DAY, @MaxGapDays, d.EndDate) >= d2.StartDate
            AND d2.StartDate > d.CurrentStart
)
-- Every row derived from one anchor keeps the anchor's StartDate; the merged range ends
-- at the largest EndDate reached from it.
SELECT cdata.Id, cdata.Type, cdata.StartDate, EndDate = MAX(cdata.EndDate)
FROM        cdata
GROUP BY cdata.Id, cdata.Type, cdata.StartDate
-- The default limit of 100 recursion levels raises an error on chains of more than
-- 100 linked rows; 0 removes the limit.
OPTION (MAXRECURSION 0);

答案 2 :(得分:1)

这看起来像一个Packing Intervals问题。请参阅Itzik Ben-Gan的帖子,以获取所有详细信息以及他建议使用哪些索引来使其高效工作。他提出了一种无需递归CTE的解决方案。

两点说明。

  1. 下面的查询假定区间为左闭右开 [StartDate; EndDate),即 StartDate 包含在内,而 EndDate 不包含在内。这种表示此类数据的方法通常是最方便的。(就像在编程语言中,数组下标从零开始通常比从一开始更方便。)

  2. 我添加了一个 RowID 列,以保证排序是明确的。

样本数据

推荐索引:针对持久化表 T 的两条 CREATE UNIQUE INDEX 语句见下文;下面的脚本使用表变量 @T 构造样本数据。

-- Sample intervals for the packing-intervals query.
-- RowID gives every row a unique tie-breaker for ordering.
DECLARE @T TABLE
(
    RowID     INT IDENTITY,
    id        INT,
    StartDate DATE,
    EndDate   DATE,
    tp        INT
);

INSERT INTO @T (id, StartDate, EndDate, tp)
VALUES
    (1, '2012-02-18', '2012-03-18', 1),
    (1, '2012-03-17', '2012-06-29', 1),
    (1, '2012-06-27', '2012-09-27', 1),
    (1, '2014-08-23', '2014-09-24', 3),
    (1, '2014-09-23', '2014-10-24', 3),
    (1, '2014-10-23', '2014-11-24', 3),
    (2, '2015-07-04', '2015-08-06', 1),
    (2, '2015-08-04', '2015-09-06', 1),
    (3, '2013-11-01', '2013-12-01', 0),
    (3, '2018-01-09', '2018-02-09', 0);

-- Convert the closed intervals [Start; End] into half-open ones [Start; End):
-- every row is shifted on purpose, hence no WHERE clause.
UPDATE @T SET EndDate = DATEADD(DAY, 1, EndDate);

查询

阅读Itzik的帖子以了解发生了什么。他在那里有很好的插图。简而言之,每个时间戳(开始或结束)都被视为一个事件。每个事件的类型为 + 或 -。每次遇到 + 事件(某个区间开始)时,我们都会增加运行计数器。每次遇到 - 事件(某个区间结束)时,我们都会减少运行计数器。当运行计数器为0时,表示一段连续的重叠区间结束了。

推荐索引(适用于名为 T 的持久化表):

-- indexes to support solutions
CREATE UNIQUE INDEX idx_start_id ON T(id, tp, StartDate, RowID);
CREATE UNIQUE INDEX idx_end_id ON T(id, tp, EndDate, RowID);

我按原样使用Itzik的查询,只是更改了列名以匹配您的名字。

下面是查询,其输出即为合并后的结果:

-- Packs the intervals in @T per (id, tp) without recursion: every interval is split into
-- a start event and an end event, and a merged range is delimited by the events at which
-- no other interval is active.
-- EndDate in @T is read as exclusive (the sample-data script stores EndDate + 1 day);
-- the final SELECT subtracts that day again.
-- NOTE(review): no gap tolerance is applied here; intervals separated by a gap are kept
-- apart -- confirm whether the 14-day rule from the question should be added.
WITH C1 AS
-- let e = end ordinals, let s = start ordinals
-- One row per event: start events carry EventType +1 and ordinal s,
-- end events carry EventType -1 and ordinal e; ordinals restart for each (id, tp).
(
    SELECT
        RowID, id, tp, StartDate AS ts, +1 AS EventType,
        NULL AS e,
        ROW_NUMBER() OVER(PARTITION BY id, tp ORDER BY StartDate, RowID) AS s
    FROM @T

    UNION ALL

    SELECT
        RowID, id, tp, EndDate AS ts, -1 AS EventType,
        ROW_NUMBER() OVER(PARTITION BY id, tp ORDER BY EndDate, RowID) AS e,
        NULL AS s
    FROM @T
),
C2 AS
-- let se = start or end ordinal, namely, how many events (start or end) happened so far
-- EventType DESC puts a start (+1) before an end (-1) that shares the same ts.
(
    SELECT C1.*,
    ROW_NUMBER() OVER(PARTITION BY id, tp ORDER BY ts, EventType DESC, RowID) AS se
    FROM C1
),
C3 AS
-- For start events, the expression s - (se - s) - 1 represents how many sessions were active
-- just before the current (hence - 1)
--
-- For end events, the expression (se - e) - e represents how many sessions are active
-- right after this one
--
-- The above two expressions are 0 exactly when a group of packed intervals
-- either starts or ends, respectively
--
-- After filtering only events when a group of packed intervals either starts or ends,
-- group each pair of adjacent start/end events
-- (integer division maps row numbers 1,2 to grpnum 1, rows 3,4 to grpnum 2, and so on)
(
    SELECT id, tp, ts,
        ((ROW_NUMBER() OVER(PARTITION BY id, tp ORDER BY ts) - 1) / 2 + 1)
        AS grpnum
    FROM C2
    WHERE COALESCE(s - (se - s) - 1, (se - e) - e) = 0
)
-- Each grpnum holds one opening and one closing event: MIN(ts) is the packed start,
-- MAX(ts) less one day is the packed end expressed as an inclusive date.
SELECT id, tp, MIN(ts) AS StartDate, DATEADD(day, -1, MAX(ts)) AS EndDate
FROM C3
GROUP BY id, tp, grpnum
ORDER BY id, tp, StartDate;

答案 3 :(得分:0)

-- Sample data from the question, loaded into a temp table.
-- Drop a leftover copy first so the script can be re-run in the same session.
IF OBJECT_ID('tempdb..#table') IS NOT NULL
    DROP TABLE #table;

CREATE TABLE #table
(
    Id        INT,
    StartDate DATE,
    EndDate   DATE,
    Type      INT
);

-- Integer columns receive integer literals (no implicit varchar-to-int conversion),
-- and the column list keeps the INSERT valid if the table definition changes.
INSERT INTO #table (Id, StartDate, EndDate, Type)
VALUES
    (1, '2012-02-18', '2012-03-18', 1),
    (1, '2012-03-19', '2012-06-19', 1),
    (1, '2012-06-27', '2012-09-27', 1),
    (1, '2014-08-23', '2014-09-24', 3),
    (1, '2014-09-23', '2014-10-24', 3),
    (1, '2014-10-23', '2014-11-24', 3),
    (2, '2015-07-04', '2015-08-06', 1),
    (2, '2015-08-04', '2015-09-06', 1),
    (3, '2013-11-01', '2013-12-01', 0),
    (3, '2018-01-09', '2018-02-09', 0);

-- Merge rows of #table that share Id and Type and whose ranges overlap or are at most
-- 14 days apart. Uses only features available in SQL Server 2008.
-- Grouping by YEAR(StartDate), YEAR(EndDate) is not an overlap test: it merges disjoint
-- ranges that fall in the same year and splits ranges that cross a year boundary.
-- Assumes Id and Type are not NULL (the joins below compare them with "=").
;WITH island_start AS
(
    -- A row opens a new merged range when no earlier-starting row of the same Id/Type
    -- ends within 14 days before (or after) its StartDate.
    SELECT DISTINCT t.Id, t.Type, t.StartDate
    FROM #table AS t
    WHERE NOT EXISTS (
        SELECT 1
        FROM #table AS p
        WHERE p.Id = t.Id
          AND p.Type = t.Type
          AND p.StartDate < t.StartDate
          AND DATEADD(DAY, 14, p.EndDate) >= t.StartDate
    )
)
SELECT t.Id AS ID,
       MIN(t.StartDate) AS sd,
       MAX(t.EndDate) AS ed,
       t.Type AS type
FROM #table AS t
    -- Each row belongs to the merged range opened by the latest island start
    -- at or before its own StartDate.
    CROSS APPLY (
        SELECT MAX(s.StartDate) AS IslandStart
        FROM island_start AS s
        WHERE s.Id = t.Id
          AND s.Type = t.Type
          AND s.StartDate <= t.StartDate
    ) AS isl
GROUP BY t.Id, t.Type, isl.IslandStart
ORDER BY t.Id, sd;

答案 4 :(得分:0)

这可以通过使用一些窗口函数和CTE轻松实现。下面是解决方法:

-- Sample data: the question's rows, with the first Id 1 / Type 1 ranges made to overlap.
DECLARE @table TABLE
(
    id        INT,
    StartDate DATE,
    EndDate   DATE,
    [Type]    INT
);

-- Rows are supplied through a table value constructor and inserted by column name.
INSERT INTO @table (id, StartDate, EndDate, [Type])
SELECT v.id, v.StartDate, v.EndDate, v.[Type]
FROM (VALUES
        (1, '2012-02-18', '2012-03-18', 1),
        (1, '2012-03-17', '2012-06-29', 1),
        (1, '2012-06-27', '2012-09-27', 1),
        (1, '2014-08-23', '2014-09-24', 3),
        (1, '2014-09-23', '2014-10-24', 3),
        (1, '2014-10-23', '2014-11-24', 3),
        (2, '2015-07-04', '2015-08-06', 1),
        (2, '2015-08-04', '2015-09-06', 1),
        (3, '2013-11-01', '2013-12-01', 0),
        (3, '2018-01-09', '2018-02-09', 0)
     ) AS v (id, StartDate, EndDate, [Type]);

-- Merge ranges per (Id, [Type]) that overlap or lie at most @MaxGapDays apart.
-- Written for SQL Server 2008: window frames (ROWS BETWEEN ...) and ordered window
-- aggregates need SQL Server 2012, so the running MAX and running SUM are computed
-- with APPLY over row numbers instead.
DECLARE @MaxGapDays INT = 14;   -- a gap of up to this many days still counts as overlap

WITH C1 AS
(
  -- grp identifies the (Id, [Type]) partition (DENSE_RANK treats NULL keys as equal);
  -- rn orders the rows inside it. Rows tied on rn's ordering are identical in every
  -- column used below, so the order among them does not affect the result.
  SELECT Id, [Type], StartDate, EndDate,
    DENSE_RANK() OVER(ORDER BY Id, [Type]) AS grp,
    ROW_NUMBER() OVER(PARTITION BY Id, [Type]
          ORDER BY StartDate, EndDate) AS rn
  FROM @table
),
C2 AS
(
  -- StartFlag = 1 when the row opens a new merged range: it starts more than
  -- @MaxGapDays after the latest EndDate of all preceding rows, or it has no
  -- preceding row (PrevEnd is NULL, so the comparison is not true).
  SELECT cur.Id, cur.[Type], cur.StartDate, cur.EndDate, cur.grp, cur.rn,
    CASE WHEN cur.StartDate <= DATEADD(DAY, @MaxGapDays, prev.PrevEnd) THEN 0 ELSE 1 END AS StartFlag
  FROM C1 AS cur
    OUTER APPLY ( SELECT MAX(p.EndDate) AS PrevEnd
                  FROM C1 AS p
                  WHERE p.grp = cur.grp
                    AND p.rn < cur.rn ) AS prev
),
C3 AS
(
  -- GroupID = number of range openings up to and including the current row.
  SELECT cur.Id, cur.[Type], cur.StartDate, cur.EndDate, running.GroupID
  FROM C2 AS cur
    CROSS APPLY ( SELECT SUM(f.StartFlag) AS GroupID
                  FROM C2 AS f
                  WHERE f.grp = cur.grp
                    AND f.rn <= cur.rn ) AS running
)
SELECT Id, [Type], MIN(StartDate) AS StartDate, MAX(EndDate) AS EndDate
FROM C3
GROUP BY Id, [Type], GroupID;