Hive 窗口查询

时间:2018-10-26 18:39:00

标签: hive hiveql

我有一个具有以下架构的基本Hive表: This is my base table

我想要以下输出: This is the desired output

因此,基本上是按所有列分组,并计算当月以及过去3个月(含当月)内不同 Encounter 的数量。例如,对于 DischargeMonthYear 为2018年1月的行, num_discharges_last_30_days 是2018年1月出院的患者数(3),而 num_discharges_last_90_days 是2017年11月、2017年12月和2018年1月出院的患者数。由于本例中2018年1月之前没有数据,因此两个计数相同。同样,对于2018年3月, num_discharges_last_90_days 应包含2018年1月、2月和3月的计数(3 + 2 + 2 = 7)。对于2018年6月,由于没有2018年4月和5月的数据,因此应只包含2018年6月的计数,而不应包含之前分组/分区的计数。

我有以下查询,它给出的 num_discharges_last_90_days 合计值在2018年6月及之前都是正确的,但它没有遵循前面各列的分组;对于2018年7月,它把2018年6月的计数也算了进去,这是不应该的,因为两者的地区(region)不同。

如果我为其添加 PARTITION BY region(以及其他列)子句,则 num_discharges_last_90_days 对2018年7月是正确的,但对2018年6月就不正确了,因为它包含了2018年2月和3月的计数。

`

-- Rebuild the temporary sample table that the queries in this file read from.
DROP TABLE IF EXISTS Encounter;

CREATE TEMPORARY TABLE Encounter (
    Encounter_no   INT,
    Admit_date     DATE,
    discharge_date DATE,
    region         VARCHAR(50),
    Facilityname   VARCHAR(50),
    Payertype      VARCHAR(10),
    Payernamme     VARCHAR(20),  -- spelling kept as-is: the queries in this file use this exact name
    patient_type   VARCHAR(10)
);

-- Seed rows, one SELECT per encounter, values listed in the table's column order.
-- Discharges per month: Jan-2018 x3, Feb-2018 x2, Mar-2018 x2, Jun-2018 x2 (all 'Midwest'),
-- plus a single Jul-2018 discharge in 'NorthEast'. There are no rows for Apr or May.
INSERT INTO Encounter
          SELECT 12345, '2018-01-01', '2018-01-05', 'Midwest',   'ABC', 'MCR', 'MCR123', 'IP'
UNION ALL SELECT 12346, '2018-01-02', '2018-01-06', 'Midwest',   'ABC', 'MCR', 'MCR123', 'IP'
UNION ALL SELECT 12347, '2018-01-03', '2018-01-07', 'Midwest',   'ABC', 'MCR', 'MCR123', 'IP'
UNION ALL SELECT 12348, '2018-02-04', '2018-02-08', 'Midwest',   'ABC', 'MCR', 'MCR123', 'IP'
UNION ALL SELECT 12349, '2018-02-05', '2018-02-09', 'Midwest',   'ABC', 'MCR', 'MCR123', 'IP'
UNION ALL SELECT 12350, '2018-03-06', '2018-03-10', 'Midwest',   'ABC', 'MCR', 'MCR123', 'IP'
UNION ALL SELECT 12351, '2018-03-07', '2018-03-11', 'Midwest',   'ABC', 'MCR', 'MCR123', 'IP'
UNION ALL SELECT 12352, '2018-06-08', '2018-06-12', 'Midwest',   'ABC', 'MCR', 'MCR123', 'IP'
UNION ALL SELECT 12353, '2018-06-09', '2018-06-13', 'Midwest',   'ABC', 'MCR', 'MCR123', 'IP'
UNION ALL SELECT 12354, '2018-07-10', '2018-07-14', 'NorthEast', 'ABC', 'MCR', 'MCR123', 'IP'
;
--SELECT from_unixtime(unix_timestamp(e.discharge_date, 'yyyy-MM-dd'),'MM') AS `Discharge_Month` FROM Encounter e

-- Monthly discharge counts per (region, facility, payer, patient type) grouping,
-- plus a rolling total over the current result row and the two rows before it.
--
-- Caveats that follow directly from how the query is written:
--   * The window has no PARTITION BY, so the rolling total runs across all
--     groupings in MonthNum order.
--   * The frame is ROWS-based: it counts the two preceding result rows, which
--     are not necessarily the two preceding calendar months.
--   * Months are matched on the two-digit month number only; the year is not
--     part of the join.
WITH month_numbers AS
(
    -- Calendar scaffold: the twelve two-digit month numbers.
    SELECT '01' AS MonthNum UNION ALL
    SELECT '02' UNION ALL
    SELECT '03' UNION ALL
    SELECT '04' UNION ALL
    SELECT '05' UNION ALL
    SELECT '06' UNION ALL
    SELECT '07' UNION ALL
    SELECT '08' UNION ALL
    SELECT '09' UNION ALL
    SELECT '10' UNION ALL
    SELECT '11' UNION ALL
    SELECT '12'
),
A AS
(
    -- Left join from the scaffold so that months without any encounter still
    -- produce a row (all encounter columns NULL, counted as 0). Those filler
    -- rows take part in the window frame and are only dropped by the final filter.
    SELECT
        m.MonthNum,
        e.region,
        e.facilityname,
        from_unixtime(unix_timestamp(e.discharge_date, 'yyyy-MM-dd'), 'MMM-yyyy') AS Discharge_Month,
        e.Payertype,
        e.Payernamme,
        e.patient_type,
        -- Rows in the group, or 0 when region is NULL or empty.
        CASE
            WHEN e.region IS NOT NULL AND e.region <> '' THEN COUNT(*)
            ELSE 0
        END AS num_discharges_last_30_days,
        -- Same per-group count, summed over this row and the two rows before it.
        SUM(
            CASE
                WHEN e.region IS NOT NULL AND e.region <> '' THEN COUNT(*)
                ELSE 0
            END
        ) OVER (
            ORDER BY m.MonthNum
            ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
        ) AS num_discharges_last_90_days
    FROM month_numbers m
    LEFT JOIN Encounter e
        ON m.MonthNum = from_unixtime(unix_timestamp(e.discharge_date, 'yyyy-MM-dd'), 'MM')
    GROUP BY
        m.MonthNum,
        e.region,
        e.facilityname,
        from_unixtime(unix_timestamp(e.discharge_date, 'yyyy-MM-dd'), 'MMM-yyyy'),
        e.Payertype,
        e.Payernamme,
        e.patient_type
)
SELECT *
FROM A
WHERE A.region IS NOT NULL  -- drop the filler rows for months without encounters
;

`

1 个答案:

答案 0 :(得分:0)

我的同事用下面的查询解决了这个问题。它需要一个自连接(self-join)以及 CASE 和 WHERE 子句,才能只统计最近3个月的数据。

-- Per-group monthly discharge counts (measure_1) and the trailing three-month
-- total including the current month (measure_2), computed with a self-join of
-- the monthly aggregate.
--
-- Output columns: region, facilityname, payertype, payernamme, patient_type,
--                 month_year, measure_1, measure_2.
WITH CTE AS (
    -- One row per dimension combination and discharge month.
    -- month_year is the last day of the discharge month; it is NULL when
    -- discharge_date is NULL.
    SELECT
        a.region,
        a.facilityname,
        a.payertype,
        a.payernamme,
        a.patient_type,
        LAST_DAY(a.discharge_date) AS month_year,
        COUNT(a.encounter_no)      AS measure_1
    FROM Encounter AS a
    GROUP BY
        a.region,
        a.facilityname,
        a.payertype,
        a.payernamme,
        a.patient_type,
        LAST_DAY(a.discharge_date)
)
SELECT
    cur.region,
    cur.facilityname,
    cur.payertype,
    cur.payernamme,
    cur.patient_type,
    cur.month_year,
    -- The join repeats each cur row once per matching month; MAX collapses it back.
    MAX(cur.measure_1)  AS measure_1,
    SUM(prev.measure_1) AS measure_2
FROM CTE AS cur
-- Null-safe equality (<=>) so that a NULL dimension value pairs with itself,
-- matching how GROUP BY forms the groups. Every row therefore joins at least
-- to itself, and no "unmatched row" fallback is needed.
INNER JOIN CTE AS prev
    ON  cur.region       <=> prev.region
    AND cur.facilityname <=> prev.facilityname
    AND cur.payertype    <=> prev.payertype
    AND cur.payernamme   <=> prev.payernamme
    AND cur.patient_type <=> prev.patient_type
-- The month range stays in WHERE rather than ON so the join condition is
-- equality-only.
WHERE
    -- Month-end dates from two months back up to and including the current month.
    prev.month_year BETWEEN add_months(cur.month_year, -2) AND cur.month_year
    -- A NULL-month row is paired only with itself, so it keeps its own count
    -- and never adds to the totals of dated months.
    OR (cur.month_year IS NULL AND prev.month_year IS NULL)
GROUP BY
    cur.region,
    cur.facilityname,
    cur.payertype,
    cur.payernamme,
    cur.patient_type,
    cur.month_year;