SQL Split Island On Criteria

时间:2017-12-28 21:07:48

标签: sql sql-server gaps-and-islands

我有一个SQL表,其中的 From/To 日期如下:

Row    From                    To 
--------------------------------------------------
1      2017-10-28 00:00:00     2017-10-30 00:00:00
2      2017-10-30 00:00:00     2017-10-31 00:00:00
3      2017-10-31 00:00:00     2017-10-31 07:30:00
4      2017-10-31 14:41:00     2017-10-31 15:14:00
5      2017-10-31 17:13:00     2017-11-01 00:00:00
6      2017-11-01 00:00:00     2017-11-01 23:45:00
7      2017-11-02 03:13:00     2017-11-02 07:56:00

我需要将连续数据分组到岛屿中。数据不重叠。使用此查询可以轻松完成此操作:

-- Collapse contiguous [From]/[To] rows into islands: one output row per island.
;WITH FlaggedRows AS
(
    -- Flag = 1 when the row does not touch/overlap the previous row's [To].
    -- The first row has no predecessor (LAG yields NULL), so it falls to ELSE -> 1.
    SELECT
        [From],
        [To],
        CASE
            WHEN [From] <= LAG([To]) OVER (ORDER BY [From]) THEN 0
            ELSE 1
        END AS IsIslandStart
    FROM dbo.DateTable
),
NumberedRows AS
(
    -- Running total of the flags assigns every row its island number.
    SELECT
        [From],
        [To],
        SUM(IsIslandStart) OVER (ORDER BY [From]) AS IslandNo
    FROM FlaggedRows
),
Islands AS
(
    -- Earliest start and latest end per island.
    SELECT
        MIN([From]) AS [From],
        MAX([To])   AS [To]
    FROM NumberedRows
    GROUP BY IslandNo
)
SELECT
    [From],
    [To]
FROM Islands;

并给我这些结果:

From                    To                 Rows 
-----------------------------------------------------
2017-10-28 00:00:00     2017-10-31 07:30:00      1-3
2017-10-31 14:41:00     2017-10-31 15:14:00      4
2017-10-31 17:13:00     2017-11-01 23:45:00      5-6
2017-11-02 03:13:00     2017-11-02 07:56:00      7

我遇到的问题是:我需要修改查询,使岛屿在累积的记录达到某个总持续时间之后就被限制/拆分。这个时长是一个输入/硬编码的值。拆分以整条记录为单位,而不是在某条记录的 From-To 范围中间切开。例如,我需要按 27 小时来拆分岛屿,这样会得到如下结果:

From                    To                       Rows 
-----------------------------------------------------
2017-10-28 00:00:00     2017-10-30 00:00:00      1
2017-10-30 00:00:00     2017-10-31 07:30:00      2-3
2017-10-31 17:13:00     2017-11-01 23:45:00      5-6

第一个岛屿被拆分,因为第1行本身(48小时)就已经达到了27小时的时长,所以第2、3行另外组成一个岛屿。第4行和第7行不足以创建一个岛屿,因此它们会被忽略。

我尝试在内层查询中使用 lag 函数来计算跨行的“滚动持续时间”,但它不适用于超过2行的岛屿,因为它只能取到上一行的持续时间,我无法把累计值“携带”下去。

-- Attempted "rolling duration" per row, in minutes.
-- RollingDuration: when the row is contiguous with the previous row, its own
--   length plus the previous row's length; otherwise just its own length.
-- StartGroup: 1 when the row starts a new island, 0 when it continues one.
-- Known limitation: LAG only looks one row back, so the total is not carried
-- across islands made of three or more rows.
SELECT
    [From],
    [To],
    -- Fix: the OVER (ORDER BY [From]) clause below was missing its closing parenthesis.
    (CASE WHEN [From] <= LAG([To]) OVER (ORDER BY [From])
          THEN DATEDIFF(minute, [From], [To])
               + LAG(DATEDIFF(minute, [From], [To])) OVER (ORDER BY [From])
          ELSE DATEDIFF(minute, [From], [To])
     END) AS RollingDuration,
    (CASE WHEN [From] <= LAG([To]) OVER (ORDER BY [From])
          THEN 0
          ELSE 1
     END) AS StartGroup
FROM dbo.DateTable

1 个答案:

答案 0 :(得分:1)

我能想到的“最差的”做法是使用“古怪的更新”(quirky update)。(可以去谷歌搜一下,说真的,这个词不是我编的。)


使用它我可以在有间隙或总计达到27小时的情况下开始一个新组。然后照常进行。

-- Working table for the quirky update
----------------------------------------------------------------------
-- [group_start] is an extra column: it identifies the group a row belongs
-- to and is useful data in its own right.
-- The clustered primary key is meant to fix the order in which the rows
-- are processed, and is reused by the final reporting step.
----------------------------------------------------------------------

CREATE TABLE sample
(
    id           INT,
    start        DATETIME  NOT NULL,            -- key column (PRIMARY KEY already implies NOT NULL)
    cease        DATETIME,
    group_start  DATETIME  NOT NULL DEFAULT 0,  -- 0 = "no group assigned yet"
    PRIMARY KEY CLUSTERED (group_start, start)
);

-- Load the seven sample intervals; group_start is left to its column default.
INSERT INTO sample (id, start, cease)
SELECT
    src.id,
    src.start,
    src.cease
FROM
(
    VALUES
        (1, '2017-10-28 00:00:00', '2017-10-30 00:00:00'),
        (2, '2017-10-30 00:00:00', '2017-10-31 00:00:00'),
        (3, '2017-10-31 00:00:00', '2017-10-31 07:30:00'),
        (4, '2017-10-31 14:41:00', '2017-10-31 15:14:00'),
        (5, '2017-10-31 17:13:00', '2017-11-01 00:00:00'),
        (6, '2017-11-01 00:00:00', '2017-11-01 23:45:00'),
        (7, '2017-11-02 03:13:00', '2017-11-02 07:56:00')
) AS src (id, start, cease);


-- Quirky Update
----------------------------------------------------------------------
-- Sets [group_start] on every row to the start of the group it belongs to,
-- carrying the current group's start from row to row in @grp_start:
-- -> new group if there is a gap since the previous row (start <> previous cease)
-- -> new group if this row starts 27 hours or more after the current group's start
-- -> else same group as the previous row
-- NOTE(review): this relies on rows being updated in clustered-key order,
-- which the engine does not formally guarantee - confirm before production use.
----------------------------------------------------------------------

-- Running "start of the current group"; begins at the zero datetime.
DECLARE @grp_start DATETIME = 0;

WITH
    lagged AS
(
    -- Expose the previous row's cease (NULL for the first row) next to each row.
    SELECT *, LAG(cease) OVER (ORDER BY group_start, start) AS lag_cease FROM sample
)
UPDATE
    lagged
SET
    -- Compound assignment: the CASE result is written to the column and to the
    -- variable, so the next row sees the group start chosen for this row.
    @grp_start
        = group_start
            -- For the first row lag_cease is NULL, so the first WHEN is not true
            -- and the decision falls through to the 27-hour test.
            = CASE WHEN start <> lag_cease                     THEN start
                   WHEN start >= DATEADD(hour, 27, @grp_start) THEN start
                                                               ELSE @grp_start END
OPTION
    -- Single-threaded plan; presumably to keep the rows in one ordered stream.
    (MAXDOP 1)
;

-- Report the rows of every group that is long enough
----------------------------------------------------------------------
-- group_cease = latest cease among the rows sharing a group_start
-- Rows whose group runs for less than 12 hours are filtered out
-- (the 12-hour minimum is hard-coded here)
----------------------------------------------------------------------

WITH bounded_groups AS
(
    SELECT
        id,
        start,
        cease,
        group_start,
        MAX(cease) OVER (PARTITION BY group_start) AS group_cease
    FROM
        sample
)
SELECT
    id,
    start,
    cease,
    group_start,
    group_cease
FROM
    bounded_groups
WHERE
    DATEADD(hour, 12, group_start) <= group_cease
;

http://dbfiddle.uk/?rdbms=sqlserver_2017&fiddle=1bec5b3fe920c1affd58f23a11e280a0