在Hive中使用CTE时连接(JOIN)出错

时间:2017-12-20 23:15:51

标签: mysql sql hive

我有两个查询正在进行类似的工作。 一个没有CTE,一个有CTE。我无法弄清楚为什么第一个查询能返回结果,而第二个查询却完全没有结果。

过去两个小时我一直在尝试各种连接方式来解决这个问题,但在查询1中有效的连接在查询2中却不起作用。希望有人能给我一些指导。

第一个查询(返回结果):

-- Latency statistics over the required users, driven by a single shared CTE.
--
-- MessageCTE: rows of the message table whose to_user matches a user_id in
-- the required-users table. Only the columns read further down are projected.
--
-- The derived table UserLatency joins three slices of MessageCTE on
-- (dt, to_user, id): class = 'pb', class = 'rdl' with family = 'stm', and
-- class = 'dl'. latency is dl.ts - pb.ts. The slices are not de-duplicated,
-- so every matching pb/rdl/dl combination contributes one row.
WITH MessageCTE AS 
    (
    SELECT latencydata.dt
    , latencydata.id
    , latencydata.ts
    , latencydata.family
    , latencydata.to_user
    , latencydata.class
    FROM dhruv.MessageLatencyInformation_20171210_20171125_to_20171130_02 AS latencydata
    INNER JOIN dhruv.UsersOn503AndAbove_20171201_200k AS required_users
    ON latencydata.to_user = required_users.user_id
    )
SELECT COUNT(DISTINCT to_user) AS Users
, AVG(latency) AS AvgLatency
-- CASE without a match yields NULL, which AVG skips: mean of positive latencies only.
, AVG(CASE WHEN latency > 0 THEN latency ELSE NULL END) AS AvgLatency_Positive
, PERCENTILE(latency, 0.5) AS 50Percentile
, PERCENTILE(latency, 0.75) AS 75Percentile
, PERCENTILE(latency, 0.8) AS 80Percentile
, PERCENTILE(latency, 0.9) AS 90Percentile
, PERCENTILE(latency, 0.95) AS 95Percentile
, PERCENTILE(latency, 0.99) AS 99Percentile
FROM
    (
    -- dt and to_user are taken from latency_pb; the original referenced an
    -- alias "a" that is not defined anywhere in this FROM clause.
    SELECT latency_pb.dt
    , latency_pb.to_user
    , (latency_dl.ts - latency_pb.ts) AS latency
    FROM 
        (
        SELECT dt
        , id
        , ts
        , to_user
        FROM MessageCTE
        WHERE class = 'pb'
        ) AS latency_pb
    INNER JOIN 
        (
        -- Used only to require a matching 'rdl'/'stm' row; no ts needed.
        SELECT dt
        , id
        , to_user
        FROM MessageCTE
        WHERE class = 'rdl'
        AND family = 'stm'
        ) AS latency_rdl
    ON latency_pb.dt = latency_rdl.dt
    AND latency_pb.to_user = latency_rdl.to_user
    AND latency_pb.id = latency_rdl.id
    INNER JOIN
        (
        SELECT dt
        , id
        , ts
        , to_user
        FROM MessageCTE
        WHERE class = 'dl'
        ) AS latency_dl
    ON latency_rdl.dt = latency_dl.dt
    AND latency_rdl.to_user = latency_dl.to_user
    AND latency_rdl.id = latency_dl.id
    ) AS UserLatency;

第一个查询输出: First Query Output

第二个查询只是在第一个查询的基础上稍作修改,所有条件都相同,但不知为何它没有返回任何匹配的行。我已经花了大约2个小时尝试各种连接,仍然不明白为什么没有匹配上。希望有人能给我一些指导。

第二个查询:

-- Latency statistics over the required users, with one pre-aggregated CTE per
-- message class. Each CTE collapses the message table to at most one row per
-- (dt, to_user, id) before joining to the required-users table:
--   MessageCTE_pb  : earliest ts among class = 'pb' rows
--   MessageCTE_dl  : latest ts among class = 'dl' rows
--   MessageCTE_rdl : distinct keys having a class = 'rdl', family = 'stm' row
-- latency is dl.ts - pb.ts for keys present in all three CTEs.
WITH MessageCTE_pb AS 
    (
    SELECT latencydata.dt
    , latencydata.id
    , latencydata.ts
    , latencydata.to_user
    FROM 
        (
        SELECT dt, id, MIN(ts) AS ts, to_user
        FROM dhruv.MessageLatencyInformation_20171210_20171125_to_20171130_02
        WHERE class = 'pb'
        GROUP BY dt, to_user, id
        ) AS latencydata
    INNER JOIN dhruv.UsersOn503AndAbove_20171201_200k AS required_users
    ON latencydata.to_user = required_users.user_id
    )
, MessageCTE_dl AS 
    (
    -- The original projected "to_use" here, a typo for to_user; the join and
    -- the outer SELECT both need to_user from this CTE.
    SELECT latencydata.dt
    , latencydata.id
    , latencydata.ts
    , latencydata.to_user
    FROM
        (
        SELECT dt, id, MAX(ts) AS ts, to_user 
        FROM dhruv.MessageLatencyInformation_20171210_20171125_to_20171130_02 
        WHERE class = 'dl' 
        GROUP BY dt, to_user, id
        ) AS latencydata
    INNER JOIN dhruv.UsersOn503AndAbove_20171201_200k AS required_users
    ON latencydata.to_user = required_users.user_id
    )
, MessageCTE_rdl AS 
    (
    SELECT latencydata.dt
    , latencydata.id
    , latencydata.to_user
    FROM
        (
        SELECT DISTINCT dt, id, to_user 
        FROM dhruv.MessageLatencyInformation_20171210_20171125_to_20171130_02
        WHERE class = 'rdl' 
        AND family = 'stm'
        ) AS latencydata 
    INNER JOIN dhruv.UsersOn503AndAbove_20171201_200k AS required_users
    ON latencydata.to_user = required_users.user_id
    )
SELECT COUNT(DISTINCT to_user) AS Users 
, AVG(latency) AS AvgLatency 
-- CASE without a match yields NULL, which AVG skips: mean of positive latencies only.
, AVG(CASE WHEN latency > 0 THEN latency ELSE NULL END) AS AvgLatency_Positive 
, PERCENTILE(latency, 0.5) AS 50Percentile 
, PERCENTILE(latency, 0.75) AS 75Percentile 
, PERCENTILE(latency, 0.8) AS 80Percentile 
, PERCENTILE(latency, 0.9) AS 90Percentile 
, PERCENTILE(latency, 0.95) AS 95Percentile 
, PERCENTILE(latency, 0.99) AS 99Percentile
FROM
    (
    -- dt and to_user are taken from latency_pb; the original referenced an
    -- alias "a" that is not defined anywhere in this FROM clause.
    SELECT latency_pb.dt
    , latency_pb.to_user
    , (latency_dl.ts - latency_pb.ts) AS latency
    FROM MessageCTE_pb AS latency_pb
    INNER JOIN MessageCTE_rdl AS latency_rdl
    ON latency_pb.dt = latency_rdl.dt
    AND latency_pb.to_user = latency_rdl.to_user
    AND latency_pb.id = latency_rdl.id
    INNER JOIN MessageCTE_dl AS latency_dl
    ON latency_rdl.dt = latency_dl.dt
    AND latency_rdl.to_user = latency_dl.to_user
    AND latency_rdl.id = latency_dl.id
    ) AS UserLatency;

谢谢!

第二个查询结果: Second Query Result

1 个答案:

答案 0 :(得分:1)

这其实是一条评论,只是写在答案区里,这样我才能贴出一大段SQL……

下面这个查询返回什么结果?

-- Single-pass variant: one scan of the message table joined to the required
-- users, grouped by (dt, to_user, id), using conditional aggregation.
--   latency = latest ts among class = 'dl' rows
--           - earliest ts among class = 'pb' rows
-- HAVING keeps only groups containing at least one class = 'rdl',
-- family = 'stm' row. A kept group with no 'pb' or no 'dl' row gets a NULL
-- latency: AVG and PERCENTILE skip it, but its to_user still counts in Users.
WITH per_message AS
(
    SELECT msg.dt
         , msg.to_user
         , msg.id
         , MAX(CASE WHEN msg.class = 'dl' THEN msg.ts END)
         - MIN(CASE WHEN msg.class = 'pb' THEN msg.ts END) AS latency
    FROM dhruv.MessageLatencyInformation_20171210_20171125_to_20171130_02 AS msg
    INNER JOIN dhruv.UsersOn503AndAbove_20171201_200k AS usr
        ON msg.to_user = usr.user_id
    GROUP BY msg.dt
           , msg.to_user
           , msg.id
    -- SUM is NULL when no row qualifies, so the comparison rejects the group.
    HAVING SUM(CASE WHEN msg.class = 'rdl' AND msg.family = 'stm' THEN 1 END) > 0
)
SELECT COUNT(DISTINCT to_user) AS Users,
       AVG(latency) AS AvgLatency,
       AVG(CASE WHEN latency > 0 THEN latency END) AS AvgLatency_Positive,
       PERCENTILE(latency, 0.50) AS 50Percentile,
       PERCENTILE(latency, 0.75) AS 75Percentile,
       PERCENTILE(latency, 0.80) AS 80Percentile,
       PERCENTILE(latency, 0.90) AS 90Percentile,
       PERCENTILE(latency, 0.95) AS 95Percentile,
       PERCENTILE(latency, 0.99) AS 99Percentile
FROM per_message
;