I have a ~65M-record table in Hive containing patient, facility, service start date, and service end date. The table is similar to the MWE below:
CREATE TABLE <your_db>.example
(accountId string,
provider string,
startdate timestamp,
enddate timestamp);
INSERT INTO TABLE <your_db>.example VALUES
('123A', 'smith', '2019-03-01 00:00:00', '2019-03-04 00:00:00'),
('456B', 'rogers', '2019-03-02 00:00:00', '2019-03-03 00:00:00'),
('123A', 'smith', '2019-03-03 00:00:00', '2019-03-06 00:00:00'),
('123A', 'smith', '2019-03-07 00:00:00', '2019-03-08 00:00:00'),
('456B', 'daniels', '2019-03-04 00:00:00', '2019-03-05 00:00:00'),
('456B', 'daniels', '2019-03-06 00:00:00', '2019-03-09 00:00:00'),
('123A', 'smith', '2019-03-10 00:00:00', '2019-03-12 00:00:00');
SELECT * FROM <your_db>.example;
# example.accountid example.provider example.startdate example.enddate
#1 123A smith 2019-03-01 00:00:00.0 2019-03-04 00:00:00.0
#2 456B rogers 2019-03-02 00:00:00.0 2019-03-03 00:00:00.0
#3 123A smith 2019-03-03 00:00:00.0 2019-03-06 00:00:00.0
#4 123A smith 2019-03-07 00:00:00.0 2019-03-08 00:00:00.0
#5 456B daniels 2019-03-04 00:00:00.0 2019-03-05 00:00:00.0
#6 456B daniels 2019-03-06 00:00:00.0 2019-03-09 00:00:00.0
#7 123A smith 2019-03-10 00:00:00.0 2019-03-12 00:00:00.0
I would like to define contiguous spans of startdate and enddate for each combination of accountId and provider, where the gap between one record's enddate and the next record's startdate is no more than 1 day, and then count the number of days in each contiguous block (the length of stay, called "los"). Each such grouping is called a "case". Below is what the case output should look like:
# results.accountid results.provider results.los results.startdate results.enddate
#1 123A smith 7 2019-03-01 00:00:00.0 2019-03-08 00:00:00.0
#2 456B rogers 1 2019-03-02 00:00:00.0 2019-03-03 00:00:00.0
#3 456B daniels 5 2019-03-04 00:00:00.0 2019-03-09 00:00:00.0
#4 123A smith 2 2019-03-10 00:00:00.0 2019-03-12 00:00:00.0
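For example, the first case merges rows 1, 3, and 4: row 3 starts (2019-03-03) before row 1 ends (2019-03-04), and row 4 starts (2019-03-07) exactly 1 day after row 3 ends (2019-03-06). Its los is the day difference between the case boundaries:
SELECT DATEDIFF('2019-03-08', '2019-03-01');  -- 7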
We are currently using the accepted answer to this question, but on our actual (65M-record) table it becomes a very expensive operation. I believe a more efficient solution would be to first merge the records and define each case's startdate and enddate, and then run the datediff calculation (rather than exploding every date range), but I am not sure how to accomplish this in HiveQL.
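For illustration, the shape I have in mind is something like the sketch below, where example_with_cases stands for a hypothetical version of the table with a derived case_id column marking each contiguous block (computing that column is the part I am unsure about):
SELECT accountid
, provider
, DATEDIFF(MAX(enddate), MIN(startdate)) AS los
, MIN(startdate) AS startdate
, MAX(enddate) AS enddate
FROM example_with_cases -- hypothetical: example plus a derived case_id
GROUP BY accountid, provider, case_id;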
Thanks!
Answer 0 (score: 1)
After poking around our company's warehouse, I found the creative solution below, which does what we need. I have not yet tested its performance improvement over our current "explode" solution. It does satisfy what I asked for in the original question, but it is fairly complex (though decently commented).
/*
STEP 1: Input
*/
DROP TABLE IF EXISTS <your_db>.tmp_completedatepairs;
CREATE TABLE <your_db>.tmp_completedatepairs AS
SELECT CONCAT(COALESCE(accountid, ''), '-', COALESCE(provider, '')) AS tag
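-- NOTE: assumes accountid and provider never contain '-', since SPLIT(tag, '-') in step 3 relies on it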
, startdate
, enddate
FROM <your_db>.example
WHERE startdate IS NOT NULL
AND enddate IS NOT NULL;
/*
STEP 2: Create new pairs of start and end dates that are
better time span tiles across the stay period
*/
DROP TABLE IF EXISTS <your_db>.tmp_respaned_input;
CREATE TABLE <your_db>.tmp_respaned_input AS
SELECT SD.tag
, SD.startdate
, ED.enddate
FROM (SELECT *
, ROW_NUMBER() OVER (PARTITION BY tag ORDER BY startdate ASC) AS rnsd
FROM <your_db>.tmp_completedatepairs) AS SD
LEFT JOIN
(SELECT *
, ROW_NUMBER() OVER (PARTITION BY tag ORDER BY enddate ASC) AS rned
FROM <your_db>.tmp_completedatepairs) AS ED
ON SD.tag=ED.tag
AND SD.rnsd=ED.rned;
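To see what the re-spanning in step 2 buys, consider a hypothetical tag with overlapping rows (not in the sample data): (2019-03-01 → 2019-03-05) and (2019-03-03 → 2019-03-04). Pairing the n-th smallest startdate with the n-th smallest enddate yields (2019-03-01 → 2019-03-04) and (2019-03-03 → 2019-03-05); the spans now tile the stay period with non-decreasing end dates, which is what the gap detection in step 3 relies on. On the sample data itself the pairs come out unchanged, since each row's enddate is already later than the previous row's.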
/*
STEP 3: Find gaps >1day and define stays around them
This consists of several substeps:
(a) Isolate all start dates that are more than 1 day after a preceding end date with the same tag, or are the earliest date for the tag. Number them in order.
(b) Isolate all end dates that are more than 1 day before a following start date with the same tag, or are the last date for the tag. Number them in order.
(c) Match together corresponding start and end dates after SELECTing only those dates that terminate a case (rather than dates that occur within case boundaries)
*/
DROP TABLE IF EXISTS <your_db>.results;
CREATE TABLE <your_db>.results AS
-- (c) Match together corresponding start and end dates after SELECTing only those dates that terminate a case (rather than dates that occur within case boundaries)
SELECT SPLIT(tag,'-')[0] AS accountid
, SPLIT(tag,'-')[1] AS provider
, DATEDIFF(enddate, startdate) AS los
, startdate
, enddate
FROM
-- (a) Isolate all start dates that are more than 1 day after a preceding end date with the same tag, or are the earliest date for the tag. Number them in order.
(SELECT tag
, startdate
, CONCAT(tag, CAST(ROW_NUMBER() OVER (PARTITION BY tag ORDER BY startdate ASC) AS string)) AS rnlink
FROM (SELECT L.tag
, L.startdate AS startdate
, DATEDIFF(L.startdate, R.enddate) AS d
FROM (SELECT *
, CONCAT(tag, CAST(ROW_NUMBER() OVER (PARTITION BY tag ORDER BY startdate ASC) AS string)) AS rnstart
FROM <your_db>.tmp_respaned_input) L
LEFT JOIN
(SELECT *
, CONCAT(tag, CAST(ROW_NUMBER() OVER (PARTITION BY tag ORDER BY enddate ASC) + 1 AS string)) AS rnstart
FROM <your_db>.tmp_respaned_input) R
ON L.rnstart = R.rnstart) X
WHERE d > 1 OR d IS NULL) S
LEFT JOIN
-- (b) Isolate all end dates that are more than 1 day before a following start date with the same tag, or are the last date for the tag. Number them in order.
(SELECT enddate
, CONCAT(tag, CAST(ROW_NUMBER() OVER (PARTITION BY tag ORDER BY enddate ASC) AS string)) AS rnlink
FROM (SELECT L.tag
, L.enddate AS enddate
, DATEDIFF(R.startdate, L.enddate) AS d
FROM (SELECT *
, CONCAT(tag, CAST(ROW_NUMBER() OVER (PARTITION BY tag ORDER BY enddate ASC) AS string)) AS rnend
FROM <your_db>.tmp_respaned_input) L
LEFT JOIN
(SELECT *
, CONCAT(tag, CAST(ROW_NUMBER() OVER (PARTITION BY tag ORDER BY startdate ASC) - 1 AS string)) AS rnend
FROM <your_db>.tmp_respaned_input) R
ON L.rnend = R.rnend) X
WHERE d > 1 OR d IS NULL) E
ON S.rnlink = E.rnlink;
-- Print results
SELECT *
FROM <your_db>.results
ORDER BY startdate ASC;
# results.accountid results.provider results.los results.startdate results.enddate
#1 123A smith 7 2019-03-01 00:00:00.0 2019-03-08 00:00:00.0
#2 456B rogers 1 2019-03-02 00:00:00.0 2019-03-03 00:00:00.0
#3 456B daniels 5 2019-03-04 00:00:00.0 2019-03-09 00:00:00.0
#4 123A smith 2 2019-03-10 00:00:00.0 2019-03-12 00:00:00.0
Answer 1 (score: 1)
Here is my solution; see the comments in the code:
--configuration
set hive.cli.print.header=true;
set hive.execution.engine=tez;
set hive.mapred.reduce.tasks.speculative.execution=false;
set mapred.reduce.tasks.speculative.execution=false;
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=36;
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled=true;
set hive.vectorized.execution.reduce.groupby.enabled=true;
set hive.map.aggr=true;
with example as (--this is your data example
select stack (9, '123A', 'smith', '2019-03-01 00:00:00', '2019-03-04 00:00:00',
'456B', 'rogers', '2019-03-02 00:00:00', '2019-03-03 00:00:00',
'123A', 'smith', '2019-03-03 00:00:00', '2019-03-06 00:00:00',
'123A', 'smith', '2019-03-07 00:00:00', '2019-03-08 00:00:00',
'456B', 'daniels', '2019-03-04 00:00:00', '2019-03-05 00:00:00',
'456B', 'daniels', '2019-03-06 00:00:00', '2019-03-09 00:00:00',
'123A', 'smith', '2019-03-10 00:00:00', '2019-03-12 00:00:00',
--I added one more case
'123A', 'smith', '2019-03-14 00:00:00', '2019-03-17 00:00:00',
'123A', 'smith', '2019-03-18 00:00:00', '2019-03-19 00:00:00'
) as (accountId, provider, startdate, enddate )
)
select --aggregate start and end dates for the whole case, count LOS
accountId, provider, datediff(max(enddate),min(startdate)) as los, min(startdate) startdate , max(enddate) enddate
from
(
select --distribute case_id across all records in the same case
accountId, provider, startdate, enddate,
last_value(case_id, true) over(partition by accountid, same_case_flag order by startdate ) as case_id --Bingo!!! we have case_id
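-- (last_value(x, true) is Hive's skip-NULLs variant: each row picks up the most
-- recent non-NULL case_id in its partition, in startdate order, so the UUID
-- generated on the first row of a case is spread to every row of that case)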
from
(
select --generate UUID as case_id if previous same_case_flag != current one or previous was NULL.
--One UUID will be generated for each new case
accountId, provider, startdate, enddate, same_case_flag,
case when lag(same_case_flag) over(partition by accountid order by startdate) = same_case_flag
then NULL else java_method("java.util.UUID", "randomUUID")
end case_id
from
(
select --calculate same case flag
accountId, provider, startdate, enddate,
case when datediff(startdate,lag(enddate) over(partition by accountId order by startdate)) <=1 --startdate - prev_enddate
OR
datediff(lead(startdate) over(partition by accountId order by startdate), enddate) <=1 --next_startdate-enddate
then true else false
end as same_case_flag
from example s
)s)s)s
group by accountId, provider, case_id
order by startdate; --remove the order by if not necessary, to speed up processing!!! I added it to get the same ordering as in your example
Result:
--------------------------------------------------------------------------------
VERTICES STATUS TOTAL COMPLETED RUNNING PENDING FAILED KILLED
--------------------------------------------------------------------------------
Map 1 .......... SUCCEEDED 1 1 0 0 0 0
Reducer 2 ...... SUCCEEDED 1 1 0 0 0 0
Reducer 3 ...... SUCCEEDED 1 1 0 0 0 0
Reducer 4 ...... SUCCEEDED 1 1 0 0 0 0
Reducer 5 ...... SUCCEEDED 1 1 0 0 0 0
Reducer 6 ...... SUCCEEDED 1 1 0 0 0 0
--------------------------------------------------------------------------------
VERTICES: 06/06 [==========================>>] 100% ELAPSED TIME: 10.79 s
--------------------------------------------------------------------------------
OK
accountid provider los startdate enddate
123A smith 7 2019-03-01 00:00:00 2019-03-08 00:00:00
456B rogers 1 2019-03-02 00:00:00 2019-03-03 00:00:00
456B daniels 5 2019-03-04 00:00:00 2019-03-09 00:00:00
123A smith 2 2019-03-10 00:00:00 2019-03-12 00:00:00
123A smith 5 2019-03-14 00:00:00 2019-03-19 00:00:00
Time taken: 29.049 seconds, Fetched: 5 row(s)
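For intuition, here is a hand-computed trace of the inner same_case_flag subquery for accountId 123A on the data above (prev_end and next_start are the lag/lead values; times omitted, all are 00:00:00):
startdate   enddate     prev_end    next_start  same_case_flag
2019-03-01  2019-03-04  NULL        2019-03-03  true   (next_start - enddate = -1 <= 1)
2019-03-03  2019-03-06  2019-03-04  2019-03-07  true   (startdate - prev_end = -1 <= 1)
2019-03-07  2019-03-08  2019-03-06  2019-03-10  true   (startdate - prev_end = 1 <= 1)
2019-03-10  2019-03-12  2019-03-08  2019-03-14  false  (both gaps = 2)
2019-03-14  2019-03-17  2019-03-12  2019-03-18  true   (next_start - enddate = 1 <= 1)
2019-03-18  2019-03-19  2019-03-17  NULL        true   (startdate - prev_end = 1 <= 1)
A new UUID is generated wherever the flag changes (rows 1, 4, and 5), and last_value(case_id, true) spreads each one forward within its partition, yielding the three 123A cases in the result.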
Remove the order by to get rid of the last reducer.
Depending on your data, for distributing case_id you may want to use concat(accountid, rand()), or also concat the startdate, or something similar instead of randomUUID, in case consecutive cases can have the same accountid; but randomUUID is safer because it is always unique.
This approach does not use joins at all.
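For example, the alternative case_id expression mentioned above could look like this sketch (concatenating startdate as well, to make collisions between consecutive cases for the same accountid even less likely):
case when lag(same_case_flag) over(partition by accountid order by startdate) = same_case_flag
     then NULL
     else concat(accountid, '-', startdate, '-', cast(rand() as string))
end as case_id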