根据带排名合同的日期创建历史表(间隙与孤岛问题,gaps and islands?)

时间:2014-04-04 05:04:15

标签: sql date teradata gaps-and-islands

我在Teradata有一个问题,我正在尝试建立一个历史合同表,列出一个系统,它的相应合同以及每个合同的开始和结束日期。然后将查询该表以作为时间点表进行报告。这里有一些代码可以更好地解释。

-- Sample contract table: one row per contract sold for a system.
-- CONTRACT_RANK is the precedence used when contracts for the same system
-- overlap (higher wins); CONTRACT_STRT_DT and CONTRACT_END_DT are both
-- inclusive calendar dates.
CREATE TABLE TMP_WORK_DB.SOLD_SYSTEMS
    (
        SYSTEM_ID        VARCHAR(5)
    ,   CONTRACT_TYPE    VARCHAR(10)
    ,   CONTRACT_RANK    INTEGER
    ,   CONTRACT_STRT_DT DATE
    ,   CONTRACT_END_DT  DATE
    );

-- Sample rows for the three test systems (AAA, BBB, CCC).
-- Columns are listed explicitly so the statements keep working if the table
-- layout changes, and dates use ANSI DATE literals so they do not depend on
-- the session/column date format for an implicit string-to-date conversion.
INSERT INTO TMP_WORK_DB.SOLD_SYSTEMS (SYSTEM_ID, CONTRACT_TYPE, CONTRACT_RANK, CONTRACT_STRT_DT, CONTRACT_END_DT)
VALUES ('AAA', 'BEST', 10, DATE '2012-01-01', DATE '2012-06-30');
INSERT INTO TMP_WORK_DB.SOLD_SYSTEMS (SYSTEM_ID, CONTRACT_TYPE, CONTRACT_RANK, CONTRACT_STRT_DT, CONTRACT_END_DT)
VALUES ('AAA', 'BEST', 9, DATE '2012-01-01', DATE '2012-06-30');
INSERT INTO TMP_WORK_DB.SOLD_SYSTEMS (SYSTEM_ID, CONTRACT_TYPE, CONTRACT_RANK, CONTRACT_STRT_DT, CONTRACT_END_DT)
VALUES ('AAA', 'OK', 1, DATE '2012-08-01', DATE '2012-12-30');
INSERT INTO TMP_WORK_DB.SOLD_SYSTEMS (SYSTEM_ID, CONTRACT_TYPE, CONTRACT_RANK, CONTRACT_STRT_DT, CONTRACT_END_DT)
VALUES ('BBB', 'BEST', 10, DATE '2013-12-01', DATE '2014-03-02');
INSERT INTO TMP_WORK_DB.SOLD_SYSTEMS (SYSTEM_ID, CONTRACT_TYPE, CONTRACT_RANK, CONTRACT_STRT_DT, CONTRACT_END_DT)
VALUES ('BBB', 'BETTER', 7, DATE '2013-12-01', DATE '2017-03-02');
INSERT INTO TMP_WORK_DB.SOLD_SYSTEMS (SYSTEM_ID, CONTRACT_TYPE, CONTRACT_RANK, CONTRACT_STRT_DT, CONTRACT_END_DT)
VALUES ('BBB', 'GOOD', 4, DATE '2016-12-02', DATE '2017-12-02');
INSERT INTO TMP_WORK_DB.SOLD_SYSTEMS (SYSTEM_ID, CONTRACT_TYPE, CONTRACT_RANK, CONTRACT_STRT_DT, CONTRACT_END_DT)
VALUES ('CCC', 'BEST', 10, DATE '2009-10-13', DATE '2014-10-14');
INSERT INTO TMP_WORK_DB.SOLD_SYSTEMS (SYSTEM_ID, CONTRACT_TYPE, CONTRACT_RANK, CONTRACT_STRT_DT, CONTRACT_END_DT)
VALUES ('CCC', 'BETTER', 7, DATE '2009-10-13', DATE '2016-10-14');
INSERT INTO TMP_WORK_DB.SOLD_SYSTEMS (SYSTEM_ID, CONTRACT_TYPE, CONTRACT_RANK, CONTRACT_STRT_DT, CONTRACT_END_DT)
VALUES ('CCC', 'OK', 2, DATE '2008-10-13', DATE '2017-10-14');

所需的输出是:

SYSTEM_ID   CONTRACT_TYPE   CONTRACT_STRT_DT    CONTRACT_END_DT     CONTRACT_RANK
  AAA          BEST            01/01/2012          06/30/2012           10
  AAA          OK              08/01/2012          12/30/2012           1
  BBB          BEST            12/01/2013          03/02/2014           10
  BBB          BETTER          03/03/2014          03/02/2017           7
  BBB          GOOD            03/03/2017          12/02/2017           4
  CCC          OK              10/13/2008          10/12/2009           2
  CCC          BEST            10/13/2009          10/14/2014           10
  CCC          BETTER          10/15/2014          10/14/2016           7
  CCC          OK              10/15/2016          10/14/2017           2

我不一定希望减少行数,但我希望在任何给定的时间点都能得到system_id的正确状态。请注意,当排名较高的合同结束而排名较低的合同仍然有效时,排名较低的合同会从排名较高的合同结束之处接续生效。

我们使用的是TD 14。对于日期依次衔接且排名更高的简单记录,我已经能够得到正确结果;但当两个不同排名的合同在多个日期区间上重叠时,我遇到了麻烦。

我发现这篇博文(Sharpening Stones)并且大部分时间都有效,但我仍然无法为重叠合约设置新的开始日期。

任何帮助将不胜感激。感谢。


* 更新04/04/2014 *

我想出了下面的代码,它给了我我想要的内容,但我不确定它的性能。它适用于几百行的较小数据集,但我还没有测试过几百万行:

* 更新04/07/2014 * 由于假脱机(spool)空间问题,更新了日期子查询。此查询会展开合同可能处于有效状态的所有日期,然后使用ROW_NUMBER函数取得每一天排名最高的CONTRACT_TYPE。之后按系统和合同类型分组,用MIN / MAX函数得到起止日期,从而确定排名最高的合同类型在何时发生变化。

* 更新 - 2 - 04/07/2014 * 我清理了查询,似乎表现得更好一点。

-- Build the contract history by expanding every contract to one row per
-- calendar day, keeping the top-ranked contract for each system/day, and then
-- collapsing consecutive days back into date ranges.
--
-- The collapse is a gaps-and-islands step: within one system/type/rank the
-- expression (CALENDAR_DATE - running row number) is constant for an unbroken
-- run of days and changes after a gap. Grouping by it keeps separate runs
-- apart; without it, a contract that wins, is superseded, and wins again
-- (e.g. CCC / OK in the sample data) would be reported as a single range
-- spanning the superseding contracts.
SELECT
    SYSTEM_ID
,   CONTRACT_TYPE
,   MIN(CALENDAR_DATE) AS NEW_START_DATE
,   MAX(CALENDAR_DATE) AS NEW_END_DATE
,   CONTRACT_RANK
FROM (
    SELECT
        CALENDAR_DATE
    ,   SYSTEM_ID
    ,   CONTRACT_TYPE
    ,   CONTRACT_RANK
        -- Same value for every day of one unbroken run of this contract.
    ,   CALENDAR_DATE
        - ROW_NUMBER() OVER (PARTITION BY SYSTEM_ID, CONTRACT_TYPE, CONTRACT_RANK
                             ORDER BY CALENDAR_DATE) AS ISLAND_ANCHOR
    FROM (
        -- One row per system and day: the highest-ranked active contract.
        -- Ties on rank go to the later start date, then the later end date.
        SELECT
            CALENDAR_DATE
        ,   SYSTEM_ID
        ,   CONTRACT_TYPE
        ,   CONTRACT_RANK
        ,   ROW_NUMBER() OVER (PARTITION BY SYSTEM_ID, CALENDAR_DATE
                               ORDER BY CONTRACT_RANK DESC, CONTRACT_STRT_DT DESC, CONTRACT_END_DT DESC) AS RNK
        FROM SOLD_SYSTEMS t1
        INNER JOIN (
            -- NOTE(review): this lower bound drops every contract day on or
            -- before 2013-01-01; widen it if the full history is needed
            -- (the sample rows start in 2008).
            SELECT CALENDAR_DATE
            FROM FULL_CALENDAR_TABLE ia
            WHERE CALENDAR_DATE > DATE '2013-01-01'
            ) dt
        ON CALENDAR_DATE BETWEEN CONTRACT_STRT_DT AND CONTRACT_END_DT
        QUALIFY RNK = 1
    ) daily_winner
) z1
GROUP BY SYSTEM_ID, CONTRACT_TYPE, CONTRACT_RANK, ISLAND_ANCHOR
;

3 个答案:

答案 0 :(得分:3)

以下方法在TD13.10中使用新的PERIOD功能。

-- Purpose: build volatile table vt holding, per SYSTEM_ID, the date slices
-- returned by TD_SEQUENCED_COUNT over all contract periods. Each slice is
-- stored with an inclusive start and end date; the count column returned by
-- the function (Policy_Count) is not kept.
--
-- 1. TD_SEQUENCED_COUNT can't be used in joins, so create a Volatile Table
-- 2. TD_SEQUENCED_COUNT can't use additional columns (e.g. CONTRACT_RANK),
--    so simply create a new row whenever a period starts or ends without
--    considering CONTRACT_RANK
CREATE VOLATILE TABLE vt AS
 (
   WITH cte
    (
      SYSTEM_ID
     ,pd
    )
   AS
    (
      SELECT
         SYSTEM_ID
-- PERIODs can easily be constructed on-the-fly, but the end date is not inclusive,
-- so the inclusive CONTRACT_END_DT is shifted by one day: +1 when building the
-- PERIOD here, -1 when converting the result back to an inclusive end date below.
        ,PERIOD(CONTRACT_STRT_DT, CONTRACT_END_DT + 1) AS pd
      FROM SOLD_SYSTEMS
     )
   SELECT
      SYSTEM_ID
     ,BEGIN(pd) AS CONTRACT_STRT_DT
     -- back to an inclusive end date
     ,END(pd) - 1 AS CONTRACT_END_DT
   FROM
      TABLE (TD_SEQUENCED_COUNT
            (NEW VARIANT_TYPE(cte.SYSTEM_ID) 
            ,cte.pd) 
      RETURNS (SYSTEM_ID VARCHAR(5)
              ,Policy_Count INTEGER 
              ,pd PERIOD(DATE))
      HASH BY SYSTEM_ID 
      LOCAL ORDER BY SYSTEM_ID ,pd) AS dt
 )
WITH DATA 
PRIMARY INDEX (SYSTEM_ID)
-- keep the rows after the creating transaction commits
ON COMMIT PRESERVE ROWS
;

-- Find the matching CONTRACT_RANK: for every date slice in vt, return the
-- highest-ranked contract of the same system that covers the slice.
SELECT
   vt.SYSTEM_ID
  ,t.CONTRACT_TYPE
  ,vt.CONTRACT_STRT_DT
  ,vt.CONTRACT_END_DT
  ,t.CONTRACT_RANK
FROM vt
-- If both vt and SOLD_SYSTEMS have a NUPI on SYSTEM_ID this join should be
-- quite efficient
INNER JOIN SOLD_SYSTEMS AS t
  ON vt.SYSTEM_ID = t.SYSTEM_ID
-- Both date pairs are inclusive, so the overlap test is written out
-- explicitly. OVERLAPS on (start, end) pairs does not count the end point as
-- covered and would miss a contract ending on the very day a one-day slice
-- starts.
 AND t.CONTRACT_STRT_DT <= vt.CONTRACT_END_DT
 AND t.CONTRACT_END_DT  >= vt.CONTRACT_STRT_DT
QUALIFY
-- As multiple contracts for the same period are possible:
-- find the row with the highest rank. Ties on rank go to the contract that
-- ends last; the tie-breaker must come from t, because vt.CONTRACT_END_DT is
-- the same for every row of a slice and would leave the choice arbitrary.
   ROW_NUMBER()
   OVER (PARTITION BY vt.SYSTEM_ID, vt.CONTRACT_STRT_DT
         ORDER BY t.CONTRACT_RANK DESC, t.CONTRACT_END_DT DESC) = 1
ORDER BY vt.SYSTEM_ID, vt.CONTRACT_STRT_DT
;

-- Previous query might return consecutive rows with the same CONTRACT_RANK, e.g.
-- BBB  BETTER  2014-03-03  2016-12-01  7
-- BBB  BETTER  2016-12-02  2017-03-02  7

-- If you don't want that you have to normalize the data:
-- the cte picks the winning contract per date slice, and TD_NORMALIZE_MEET
-- then merges slices of the same system/rank/type whose periods meet.
WITH cte
 (
   SYSTEM_ID
  ,CONTRACT_STRT_DT
  ,CONTRACT_END_DT
  ,CONTRACT_RANK
  ,CONTRACT_TYPE
  ,pd
 )
AS
 (
   SELECT
      vt.SYSTEM_ID
     ,vt.CONTRACT_STRT_DT
     ,vt.CONTRACT_END_DT
     ,t.CONTRACT_RANK
     ,t.CONTRACT_TYPE
      -- PERIOD end is exclusive, the stored end date is inclusive: +1 here,
      -- -1 again in the outer SELECT
     ,PERIOD(vt.CONTRACT_STRT_DT, vt.CONTRACT_END_DT + 1) AS pd
   FROM vt
   INNER JOIN SOLD_SYSTEMS AS t
     ON vt.SYSTEM_ID = t.SYSTEM_ID
        -- Inclusive overlap test written out explicitly; OVERLAPS on
        -- (start, end) pairs does not count the end point as covered and
        -- would miss a contract ending on the day a one-day slice starts.
        AND t.CONTRACT_STRT_DT <= vt.CONTRACT_END_DT
        AND t.CONTRACT_END_DT  >= vt.CONTRACT_STRT_DT
   QUALIFY
      -- Highest rank wins; ties go to the contract that ends last. The
      -- tie-breaker must come from t: vt.CONTRACT_END_DT is identical for
      -- every row of a slice and would leave the choice arbitrary.
      ROW_NUMBER()
      OVER (PARTITION BY vt.SYSTEM_ID, vt.CONTRACT_STRT_DT
            ORDER BY t.CONTRACT_RANK DESC, t.CONTRACT_END_DT DESC) = 1
 )
SELECT
   SYSTEM_ID
  ,CONTRACT_TYPE
  ,BEGIN(pd) AS CONTRACT_STRT_DT
  ,END(pd) - 1 AS CONTRACT_END_DT
  ,CONTRACT_RANK
FROM
   TABLE (TD_NORMALIZE_MEET
         (NEW VARIANT_TYPE(cte.SYSTEM_ID
                          ,cte.CONTRACT_RANK
                          ,cte.CONTRACT_TYPE)
         ,cte.pd)
   RETURNS (SYSTEM_ID VARCHAR(5)
           ,CONTRACT_RANK INT
           ,CONTRACT_TYPE VARCHAR(10)
           ,pd PERIOD(DATE))
   HASH BY SYSTEM_ID
   LOCAL ORDER BY SYSTEM_ID, CONTRACT_RANK, CONTRACT_TYPE, pd ) A
ORDER BY 1, 3;

编辑:这是获取没有易失性表和TD_SEQUENCED_COUNT的第二个查询结果的另一种方法:

-- Single-statement variant: derive the date slices from the contract
-- boundaries, intersect every contract of the system with each slice, and
-- keep the highest-ranked contract per slice.
SELECT
   t.SYSTEM_ID
  ,t.CONTRACT_TYPE
  ,BEGIN(CONTRACT_PERIOD)   AS CONTRACT_STRT_DT
   -- PERIOD end is exclusive; convert back to an inclusive end date
  ,END(CONTRACT_PERIOD) - 1 AS CONTRACT_END_DT
  ,t.CONTRACT_RANK
   -- part of the slice covered by this contract; NULL when they do not overlap
  ,seg.seg_pd P_INTERSECT PERIOD(t.CONTRACT_STRT_DT, t.CONTRACT_END_DT + 1) AS CONTRACT_PERIOD
FROM
 (
   -- One slice per pair of consecutive boundary dates of a system.
   -- The window frame "1 FOLLOWING .. UNBOUNDED FOLLOWING" makes MIN() return
   -- the next boundary date; the last boundary has none, and the QUALIFY
   -- below drops that row.
   SELECT
      bnd.SYSTEM_ID
     ,PERIOD(bnd.bnd_dt
            ,MIN(bnd.bnd_dt)
             OVER (PARTITION BY bnd.SYSTEM_ID
                   ORDER BY bnd.bnd_dt
                   ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING)) AS seg_pd
   FROM
    (
      -- All boundary dates: every contract start and every day after a
      -- contract end. UNION (not UNION ALL) removes duplicate boundaries.
      SELECT
         SYSTEM_ID
        ,CONTRACT_STRT_DT AS bnd_dt
      FROM SOLD_SYSTEMS
      UNION
      SELECT
         SYSTEM_ID
        ,CONTRACT_END_DT + 1 AS bnd_dt
      FROM SOLD_SYSTEMS
    ) AS bnd
   QUALIFY seg_pd IS NOT NULL
 ) AS seg
JOIN SOLD_SYSTEMS AS t
  ON seg.SYSTEM_ID = t.SYSTEM_ID
-- keep only contracts that actually cover part of the slice
WHERE CONTRACT_PERIOD IS NOT NULL
QUALIFY
   -- highest rank wins; ties go to the contract that ends last
   ROW_NUMBER()
   OVER (PARTITION BY seg.SYSTEM_ID, seg.seg_pd
         ORDER BY t.CONTRACT_RANK DESC, t.CONTRACT_END_DT DESC) = 1
-- Positional on purpose: output column 3 (CONTRACT_STRT_DT) shares its name
-- with a column of t, so a name reference could be ambiguous.
ORDER BY 1,3

基于此,您还可以在单个查询中包含规范化:

-- Single-statement variant with normalization: the cte derives date slices
-- from the contract boundaries and picks the highest-ranked contract per
-- slice; TD_NORMALIZE_MEET then merges slices of the same
-- system/rank/type whose periods meet.
WITH cte
 (
   SYSTEM_ID
  ,CONTRACT_TYPE
  ,CONTRACT_STRT_DT
  ,CONTRACT_END_DT
  ,CONTRACT_RANK
  -- sixth cte column: receives CONTRACT_PERIOD from the SELECT below
  ,pd
 )
AS
 (
   SELECT
      t.SYSTEM_ID
     ,t.CONTRACT_TYPE
     ,BEGIN(CONTRACT_PERIOD)  AS CONTRACT_STRT_DT
     -- PERIOD end is exclusive; convert back to an inclusive end date
     ,END(CONTRACT_PERIOD)- 1 AS CONTRACT_END_DT
     ,t.CONTRACT_RANK
     -- part of the slice covered by this contract; NULL when they do not overlap
     ,dt.p P_INTERSECT PERIOD(t.CONTRACT_STRT_DT,t.CONTRACT_END_DT + 1) AS CONTRACT_PERIOD
   FROM
    (
      -- One slice per pair of consecutive boundary dates of a system: the
      -- frame "1 FOLLOWING .. UNBOUNDED FOLLOWING" makes MIN() return the
      -- next boundary date. The last boundary has none; the QUALIFY below
      -- drops that row.
      SELECT
         dt.SYSTEM_ID
        ,PERIOD(d, MIN(d)
                   OVER (PARTITION BY dt.SYSTEM_ID
                         ORDER BY d
                         ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING)) AS p
      FROM
       (
         -- All boundary dates: every contract start and every day after a
         -- contract end. UNION removes duplicate boundaries.
         SELECT
            SYSTEM_ID
           ,CONTRACT_STRT_DT AS d
         FROM SOLD_SYSTEMS
         UNION
         SELECT
            SYSTEM_ID
           ,CONTRACT_END_DT + 1 AS d
         FROM SOLD_SYSTEMS
       ) AS dt 
       QUALIFY p IS NOT NULL
    ) AS dt
   JOIN SOLD_SYSTEMS AS t
     ON dt.SYSTEM_ID = t.SYSTEM_ID
   -- keep only contracts that actually cover part of the slice
   WHERE CONTRACT_PERIOD IS NOT NULL
   -- highest rank wins per slice; ties go to the contract that ends last
   QUALIFY 
      ROW_NUMBER() 
      OVER (PARTITION BY dt.SYSTEM_ID,p
            ORDER BY t.CONTRACT_RANK DESC, t.CONTRACT_END_DT DESC) = 1
 )
SELECT
   SYSTEM_ID
  ,CONTRACT_TYPE
  ,BEGIN(pd) AS CONTRACT_STRT_DT
  -- back to an inclusive end date
  ,END(pd) - 1 AS CONTRACT_END_DT
  ,CONTRACT_RANK
FROM 
   TABLE (TD_NORMALIZE_MEET
         (NEW VARIANT_TYPE(cte.SYSTEM_ID
                          ,cte.CONTRACT_RANK
                          ,cte.CONTRACT_TYPE)
         ,cte.pd) 
   RETURNS (SYSTEM_ID VARCHAR(5)
           ,CONTRACT_RANK INT
           ,CONTRACT_TYPE VARCHAR(10)
           ,pd PERIOD(DATE))
   HASH BY SYSTEM_ID 
   LOCAL ORDER BY SYSTEM_ID, CONTRACT_RANK, CONTRACT_TYPE, pd ) A
ORDER BY 1, 3;

答案 1 :(得分:0)

-- For each distinct contract (system, type, start, end) return its highest
-- rank and a start date pushed past the end of the preceding contract of the
-- same system, where "preceding" means previous in start/end date order.
SELECT
    system_id
,   contract_type
,   MAX(contract_rank)
    -- Contract dates are inclusive, so a contract starting on the very day
    -- the previous one ends still overlaps it by one day: compare with <=.
    -- prev_end_dt is NULL for the first contract of a system, which falls
    -- through to the original start date.
,   CASE
        WHEN contract_strt_dt <= prev_end_dt THEN prev_end_dt + 1
        ELSE contract_strt_dt
    END AS new_start
,   contract_strt_dt
,   contract_end_dt
    -- One-row frame: the end date of the immediately preceding contract.
,   MIN(contract_end_dt) OVER (PARTITION BY system_id
                               ORDER BY contract_strt_dt, contract_end_dt
                               ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS prev_end_dt
FROM sold_systems
GROUP BY system_id, contract_type, contract_strt_dt, contract_end_dt
ORDER BY contract_strt_dt, contract_end_dt, prev_end_dt

答案 2 :(得分:0)

我想我得到了...... 试试这个

-- Walk the contracts of each system from highest to lowest rank and start
-- every contract no earlier than the day after the latest end date of all
-- higher-ranked contracts.
-- Limitation: a contract is only ever shortened at its start, so the part of
-- a lower-ranked contract that runs BEFORE a higher-ranked one begins is not
-- reported as a separate row.
SELECT
    SYSTEM_ID
,   CONTRACT_TYPE
,   CONTRACT_RANK
    -- If higher-ranked contracts reach into this one, start right after them.
,   CASE
        WHEN CONTRACT_STRT_DT < NEW_START_DATE THEN NEW_START_DATE
        ELSE CONTRACT_STRT_DT
    END AS new_contract_str_dt
,   CONTRACT_END_DT
FROM (
    SELECT
        t1.SYSTEM_ID
    ,   t1.CONTRACT_TYPE
    ,   t1.CONTRACT_RANK
    ,   t1.CONTRACT_STRT_DT
    ,   t1.CONTRACT_END_DT
        -- End dates are inclusive, so the first free day is the latest
        -- higher-ranked end date + 1 (using the end date itself would make
        -- the two contracts share that day). The top-ranked contract has no
        -- predecessor and keeps its own start date.
    ,   COALESCE(
            MAX(t1.CONTRACT_END_DT) OVER (PARTITION BY t1.SYSTEM_ID
                                          ORDER BY t1.CONTRACT_RANK DESC
                                          ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 1
        ,   t1.CONTRACT_STRT_DT
        ) AS NEW_START_DATE
    FROM SOLD_SYSTEMS t1
) AS temp1
-- Drop contracts that are completely covered by higher-ranked ones.
WHERE NEW_START_DATE <= CONTRACT_END_DT

它更简单,执行计划也不错……可以用于大型表(别忘了收集统计信息)