I have a ~65M-record table in Hive containing patient, facility, service start date, and service end date. The table is similar to the MWE below:
CREATE TABLE <your_db>.example
(accountId string,
provider string,
startdate timestamp,
enddate timestamp);
INSERT INTO TABLE <your_db>.example VALUES
('123A', 'smith', '2019-03-01 00:00:00', '2019-03-04 00:00:00'),
('456B', 'rogers', '2019-03-02 00:00:00', '2019-03-03 00:00:00'),
('123A', 'smith', '2019-03-03 00:00:00', '2019-03-06 00:00:00'),
('123A', 'smith', '2019-03-07 00:00:00', '2019-03-08 00:00:00'),
('456B', 'daniels', '2019-03-04 00:00:00', '2019-03-05 00:00:00'),
('456B', 'daniels', '2019-03-06 00:00:00', '2019-03-09 00:00:00'),
('123A', 'smith', '2019-03-10 00:00:00', '2019-03-12 00:00:00');
SELECT * FROM <your_db>.example;
# example.accountid example.provider example.startdate example.enddate
#1 123A smith 2019-03-01 00:00:00.0 2019-03-04 00:00:00.0
#2 456B rogers 2019-03-02 00:00:00.0 2019-03-03 00:00:00.0
#3 123A smith 2019-03-03 00:00:00.0 2019-03-06 00:00:00.0
#4 123A smith 2019-03-07 00:00:00.0 2019-03-08 00:00:00.0
#5 456B daniels 2019-03-04 00:00:00.0 2019-03-05 00:00:00.0
#6 456B daniels 2019-03-06 00:00:00.0 2019-03-09 00:00:00.0
#7 123A smith 2019-03-10 00:00:00.0 2019-03-12 00:00:00.0
I would like to define contiguous spans of startdate and enddate for each combination of accountId and provider, where the gap between one record's enddate and the next record's startdate is no more than 1 day, and then count the number of days in each contiguous block (the length of stay, called "los"). Each such grouping is called a "case". Below is what the case output should look like:
# results.accountid results.provider results.los results.startdate results.enddate
#1 123A smith 7 2019-03-01 00:00:00.0 2019-03-08 00:00:00.0
#2 456B rogers 1 2019-03-02 00:00:00.0 2019-03-03 00:00:00.0
#3 456B daniels 5 2019-03-04 00:00:00.0 2019-03-09 00:00:00.0
#4 123A smith 2 2019-03-10 00:00:00.0 2019-03-12 00:00:00.0
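For example, the first case merges rows 1, 3, and 4: row 3 starts (2019-03-03) before row 1 ends (2019-03-04), and row 4 starts (2019-03-07) exactly 1 day after row 3 ends (2019-03-06). Its los is the day difference between the case boundaries:
SELECT DATEDIFF('2019-03-08', '2019-03-01');  -- 7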
We are currently using the accepted answer to this question, but on our actual (65M-record) table it becomes a very expensive operation. I believe a more efficient solution would be to first merge the records and define each case's startdate and enddate, and then run the datediff calculation (rather than exploding every date range), but I am not sure how to accomplish this in HiveQL.
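For illustration, the shape I have in mind is something like the sketch below, where example_with_cases stands for a hypothetical version of the table with a derived case_id column marking each contiguous block (computing that column is the part I am unsure about):
SELECT accountid
, provider
, DATEDIFF(MAX(enddate), MIN(startdate)) AS los
, MIN(startdate) AS startdate
, MAX(enddate) AS enddate
FROM example_with_cases -- hypothetical: example plus a derived case_id
GROUP BY accountid, provider, case_id;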
Thanks!
Answer 0 (score: 1)
After poking around our company's warehouse, I found the creative solution below, which does what we need. I have not yet tested its performance improvement over our current "explode" solution. It does satisfy what I asked for in the original question, but it is fairly complex (though decently commented).
/*
STEP 1: Input
*/
DROP TABLE IF EXISTS <your_db>.tmp_completedatepairs;
CREATE TABLE <your_db>.tmp_completedatepairs AS
SELECT CONCAT(COALESCE(accountid, ''), '-', COALESCE(provider, '')) AS tag
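-- NOTE: assumes accountid and provider never contain '-', since SPLIT(tag, '-') in step 3 relies on it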
, startdate
, enddate
FROM <your_db>.example
WHERE startdate IS NOT NULL
AND enddate IS NOT NULL;
/*
STEP 2: Create new pairs of start and end dates that are
better time span tiles across the stay period
*/
DROP TABLE IF EXISTS <your_db>.tmp_respaned_input;
CREATE TABLE <your_db>.tmp_respaned_input AS
SELECT SD.tag
, SD.startdate
, ED.enddate
FROM (SELECT *
, ROW_NUMBER() OVER (PARTITION BY tag ORDER BY startdate ASC) AS rnsd
FROM <your_db>.tmp_completedatepairs) AS SD
LEFT JOIN
(SELECT *
, ROW_NUMBER() OVER (PARTITION BY tag ORDER BY enddate ASC) AS rned
FROM <your_db>.tmp_completedatepairs) AS ED
ON SD.tag=ED.tag
AND SD.rnsd=ED.rned;
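To see what the re-spanning in step 2 buys, consider a hypothetical tag with overlapping rows (not in the sample data): (2019-03-01 → 2019-03-05) and (2019-03-03 → 2019-03-04). Pairing the n-th smallest startdate with the n-th smallest enddate yields (2019-03-01 → 2019-03-04) and (2019-03-03 → 2019-03-05); the spans now tile the stay period with non-decreasing end dates, which is what the gap detection in step 3 relies on. On the sample data itself the pairs come out unchanged, since each row's enddate is already later than the previous row's.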
/*
STEP 3: Find gaps >1day and define stays around them
This consists of several substeps:
(a) Isolate all start dates that are more than 1 day after a preceding end date with the same tag, or are the earliest date for the tag. Number them in order.
(b) Isolate all end dates that are more than 1 day before a following start date with the same tag, or are the last date for the tag. Number them in order.
(c) Match together corresponding start and end dates after SELECTing only those dates that terminate a case (rather than dates that occur within case boundaries)
*/
DROP TABLE IF EXISTS <your_db>.results;
CREATE TABLE <your_db>.results AS
-- (c) Match together corresponding start and end dates after SELECTing only those dates that terminate a case (rather than dates that occur within case boundaries)
SELECT SPLIT(tag,'-')[0] AS accountid
, SPLIT(tag,'-')[1] AS provider
, DATEDIFF(enddate, startdate) AS los
, startdate
, enddate
FROM
-- (a) Isolate all start dates that are more than 1 day after a preceding end date with the same tag, or are the earliest date for the tag. Number them in order.
(SELECT tag
, startdate
, CONCAT(tag, CAST(ROW_NUMBER() OVER (PARTITION BY tag ORDER BY startdate ASC) AS string)) AS rnlink
FROM (SELECT L.tag
, L.startdate AS startdate
, DATEDIFF(L.startdate, R.enddate) AS d
FROM (SELECT *
, CONCAT(tag, CAST(ROW_NUMBER() OVER (PARTITION BY tag ORDER BY startdate ASC) AS string)) AS rnstart
FROM <your_db>.tmp_respaned_input) L
LEFT JOIN
(SELECT *
, CONCAT(tag, CAST(ROW_NUMBER() OVER (PARTITION BY tag ORDER BY enddate ASC) + 1 AS string)) AS rnstart
FROM <your_db>.tmp_respaned_input) R
ON L.rnstart = R.rnstart) X
WHERE d > 1 OR d IS NULL) S
LEFT JOIN
-- (b) Isolate all end dates that are more than 1 day before a following start date with the same tag, or are the last date for the tag. Number them in order.
(SELECT enddate
, CONCAT(tag, CAST(ROW_NUMBER() OVER (PARTITION BY tag ORDER BY enddate ASC) AS string)) AS rnlink
FROM (SELECT L.tag
, L.enddate AS enddate
, DATEDIFF(R.startdate, L.enddate) AS d
FROM (SELECT *
, CONCAT(tag, CAST(ROW_NUMBER() OVER (PARTITION BY tag ORDER BY enddate ASC) AS string)) AS rnend
FROM <your_db>.tmp_respaned_input) L
LEFT JOIN
(SELECT *
, CONCAT(tag, CAST(ROW_NUMBER() OVER (PARTITION BY tag ORDER BY startdate ASC) - 1 AS string)) AS rnend
FROM <your_db>.tmp_respaned_input) R
ON L.rnend = R.rnend) X
WHERE d > 1 OR d IS NULL) E
ON S.rnlink = E.rnlink;
-- Print results
SELECT *
FROM <your_db>.results
ORDER BY startdate ASC;
# results.accountid results.provider results.los results.startdate results.enddate
#1 123A smith 7 2019-03-01 00:00:00.0 2019-03-08 00:00:00.0
#2 456B rogers 1 2019-03-02 00:00:00.0 2019-03-03 00:00:00.0
#3 456B daniels 5 2019-03-04 00:00:00.0 2019-03-09 00:00:00.0
#4 123A smith 2 2019-03-10 00:00:00.0 2019-03-12 00:00:00.0
Answer 1 (score: 1)
Here is my solution; see the comments in the code:
--configuration
set hive.cli.print.header=true;
set hive.execution.engine=tez;
set hive.mapred.reduce.tasks.speculative.execution=false;
set mapred.reduce.tasks.speculative.execution=false;
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=36;
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled=true;
set hive.vectorized.execution.reduce.groupby.enabled=true;
set hive.map.aggr=true;
with example as (--this is your data example
select stack (9, '123A', 'smith', '2019-03-01 00:00:00', '2019-03-04 00:00:00',
'456B', 'rogers', '2019-03-02 00:00:00', '2019-03-03 00:00:00',
'123A', 'smith', '2019-03-03 00:00:00', '2019-03-06 00:00:00',
'123A', 'smith', '2019-03-07 00:00:00', '2019-03-08 00:00:00',
'456B', 'daniels', '2019-03-04 00:00:00', '2019-03-05 00:00:00',
'456B', 'daniels', '2019-03-06 00:00:00', '2019-03-09 00:00:00',
'123A', 'smith', '2019-03-10 00:00:00', '2019-03-12 00:00:00',
--I added one more case
'123A', 'smith', '2019-03-14 00:00:00', '2019-03-17 00:00:00',
'123A', 'smith', '2019-03-18 00:00:00', '2019-03-19 00:00:00'
) as (accountId, provider, startdate, enddate )
)
select --aggregate start and end dates for the whole case, count LOS
accountId, provider, datediff(max(enddate),min(startdate)) as los, min(startdate) startdate , max(enddate) enddate
from
(
select --distribute case_id across all records in the same case
accountId, provider, startdate, enddate,
last_value(case_id, true) over(partition by accountid, same_case_flag order by startdate ) as case_id --Bingo!!! we have case_id
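-- (last_value(x, true) is Hive's skip-NULLs variant: each row picks up the most
-- recent non-NULL case_id in its partition, in startdate order, so the UUID
-- generated on the first row of a case is spread to every row of that case)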
from
(
select --generate UUID as case_id if previous same_case_flag != current one or previous was NULL.
--One UUID will be generated for each new case
accountId, provider, startdate, enddate, same_case_flag,
case when lag(same_case_flag) over(partition by accountid order by startdate) = same_case_flag
then NULL else java_method("java.util.UUID", "randomUUID")
end case_id
from
(
select --calculate same case flag
accountId, provider, startdate, enddate,
case when datediff(startdate,lag(enddate) over(partition by accountId order by startdate)) <=1 --startdate - prev_enddate
OR
datediff(lead(startdate) over(partition by accountId order by startdate), enddate) <=1 --next_startdate-enddate
then true else false
end as same_case_flag
from example s
)s)s)s
group by accountId, provider, case_id
order by startdate; --remove the order by if not necessary, to speed up processing!!! I added it to get the same ordering as in your example
Result:
--------------------------------------------------------------------------------
VERTICES STATUS TOTAL COMPLETED RUNNING PENDING FAILED KILLED
--------------------------------------------------------------------------------
Map 1 .......... SUCCEEDED 1 1 0 0 0 0
Reducer 2 ...... SUCCEEDED 1 1 0 0 0 0
Reducer 3 ...... SUCCEEDED 1 1 0 0 0 0
Reducer 4 ...... SUCCEEDED 1 1 0 0 0 0
Reducer 5 ...... SUCCEEDED 1 1 0 0 0 0
Reducer 6 ...... SUCCEEDED 1 1 0 0 0 0
--------------------------------------------------------------------------------
VERTICES: 06/06 [==========================>>] 100% ELAPSED TIME: 10.79 s
--------------------------------------------------------------------------------
OK
accountid provider los startdate enddate
123A smith 7 2019-03-01 00:00:00 2019-03-08 00:00:00
456B rogers 1 2019-03-02 00:00:00 2019-03-03 00:00:00
456B daniels 5 2019-03-04 00:00:00 2019-03-09 00:00:00
123A smith 2 2019-03-10 00:00:00 2019-03-12 00:00:00
123A smith 5 2019-03-14 00:00:00 2019-03-19 00:00:00
Time taken: 29.049 seconds, Fetched: 5 row(s)
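For intuition, here is a hand-computed trace of the inner same_case_flag subquery for accountId 123A on the data above (prev_end and next_start are the lag/lead values; times omitted, all are 00:00:00):
startdate   enddate     prev_end    next_start  same_case_flag
2019-03-01  2019-03-04  NULL        2019-03-03  true   (next_start - enddate = -1 <= 1)
2019-03-03  2019-03-06  2019-03-04  2019-03-07  true   (startdate - prev_end = -1 <= 1)
2019-03-07  2019-03-08  2019-03-06  2019-03-10  true   (startdate - prev_end = 1 <= 1)
2019-03-10  2019-03-12  2019-03-08  2019-03-14  false  (both gaps = 2)
2019-03-14  2019-03-17  2019-03-12  2019-03-18  true   (next_start - enddate = 1 <= 1)
2019-03-18  2019-03-19  2019-03-17  NULL        true   (startdate - prev_end = 1 <= 1)
A new UUID is generated wherever the flag changes (rows 1, 4, and 5), and last_value(case_id, true) spreads each one forward within its partition, yielding the three 123A cases in the result.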
Remove the order by to get rid of the last reducer.
Depending on your data, for distributing case_id you may want to use concat(accountid, rand()), or also concat the startdate, or something similar instead of randomUUID, in case consecutive cases can have the same accountid; but randomUUID is safer because it is always unique.
This approach does not use joins at all.
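For example, the alternative case_id expression mentioned above could look like this sketch (concatenating startdate as well, to make collisions between consecutive cases for the same accountid even less likely):
case when lag(same_case_flag) over(partition by accountid order by startdate) = same_case_flag
     then NULL
     else concat(accountid, '-', startdate, '-', cast(rand() as string))
end as case_id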