Netezza Partition与排除特定记录

时间:2016-06-16 16:58:51

标签: sql stored-procedures netezza

我在Netezza的web_event表中有以下格式的一些数据。

vstr_id  |  sessn_id  |  sessn_ts            | wbpg_nm 
V1       |  V1S1      |  02-02-2015 09:20:00 | /home/login
V1       |  V1S1      |  02-02-2015 09:22:00 | -1
V1       |  V1S1      |  02-02-2015 09:30:00 | /home/contacts
V1       |  V1S1      |  02-02-2015 09:32:00 | -1
V1       |  V1S1      |  02-02-2015 09:50:00 | /home/search
V1       |  V1S1      |  02-02-2015 09:55:00 | -1
V2       |  V2S1      |  02-02-2015 09:10:00 | /home
V2       |  V2S1      |  02-02-2015 09:15:00 | /home/apps
V2       |  V2S2      |  02-02-2015 09:20:00 | /home/news
V2       |  V2S2      |  02-02-2015 09:23:00 | /home/news/internal

这是我的源表。

我正在尝试使用该web_event表并创建另一个表,如下所示。

我希望像下面一样加载sessn_durtn表和time_on_pg表。

1)time_on_page列:当前页面加载与下一页面加载之间的时间差。如果之后没有其他事件或页面加载,则会话的最后一页可以记为0秒。该值可以用分钟或秒表示。

-- Template from the question: load one row into time_on_pg for every row of
-- web_event, carrying over the four source columns.
-- "?????" is an unresolved placeholder for the time_on_page expression the
-- asker is looking for, so this statement is not runnable as written.
Insert into time_on_pg (select VSTR_ID,
           SESSN_ID,
           sessn_ts,
           WBPG_NM,
           ????? as time_on_page
           from web_event)

vstr_id  |  sessn_id  |  sessn_ts            | wbpg_nm              | wanted_time_on_page   | currently_known_time_on_page
V1       |  V1S1      |  02-02-2015 09:20:00 | /home/login          |   10mins              |   2mins
V1       |  V1S1      |  02-02-2015 09:22:00 | -1                   |                       |   8mins
V1       |  V1S1      |  02-02-2015 09:30:00 | /home/contacts       |   20mins              |   2mins
V1       |  V1S1      |  02-02-2015 09:32:00 | -1                   |                       |   18mins
V1       |  V1S1      |  02-02-2015 09:50:00 | /home/search         |   5mins               |   5mins
V1       |  V1S1      |  02-02-2015 09:55:00 | -1                   |                       |   

V2       |  V2S1      |  02-02-2015 09:10:00 | /home                |   5mins               |   5mins
V2       |  V2S1      |  02-02-2015 09:15:00 | /home/apps           |                       |

V2       |  V2S2      |  02-02-2015 09:20:00 | /home/news           |   3mins               |   3mins
V2       |  V2S2      |  02-02-2015 09:23:00 | /home/news/internal  |                       |

我们如何在Netezza或任何SQL查询中执行此操作?

我已经有计算currently_known_time_on_page的逻辑,如下所示:
-- The asker's current query. "????????" is the placeholder for the
-- wanted_time_on_page expression that the question asks for, so the statement
-- is not runnable as written.
-- currently_known_time_on_page: because the window is ordered by event_ts DESC,
-- LAG() reads the row with the next-later timestamp in the same
-- (vstr_id, sessn_id) partition; subtracting the current timestamp gives the gap
-- to the following event, and EXTRACT(EPOCH ...) converts that gap to a number
-- (presumably seconds -- confirm for the target engine).
-- NOTE(review): event_ts is not one of the web_event columns shown in the
-- sample data (sessn_ts is); presumably they are the same column -- confirm.
SELECT vstr_id,
   sessn_id,
   sessn_ts,
   wbpg_nm,
   ???????? AS wanted_time_on_page,
   extract(epoch from (lag(event_ts) over (partition by vstr_id, sessn_id order by event_ts DESC) - event_ts)) AS currently_known_time_on_page
   from web_event;

wanted_time_on_page和currently_known_time_on_page之间的主要区别在于:计算时间差时要排除“-1”页,但最后一页除外。

2 个答案:

答案 0 :(得分:2)

我不知道您的数据集有多大以及您有多少可用内存。此查询在内存中完成所有操作。您可以将每个CTE转换为临时表以提高速度。

-- Minutes spent on each page of a session, measured from a retained event to
-- the next retained event of the same session.
-- Retained events are every row whose wbpg_nm is not '-1', plus the newest row
-- of a session when that row is a '-1'. All other '-1' rows are still returned,
-- but with a NULL time_on_page.
WITH ranked_newest_first AS (
    -- rn_desc = 1 marks the newest event of each session.
    SELECT
        sessn_id,
        sessn_ts,
        wbpg_nm,
        ROW_NUMBER() OVER (PARTITION BY sessn_id ORDER BY sessn_ts DESC) AS rn_desc
    FROM web_event
),
closing_marker AS (
    -- The newest event of a session, kept only when it is a '-1' row.
    SELECT
        sessn_id,
        sessn_ts,
        wbpg_nm
    FROM ranked_newest_first
    WHERE wbpg_nm = '-1'
      AND rn_desc = 1
),
real_pages AS (
    -- Every event that is not a '-1' row.
    SELECT
        sessn_id,
        sessn_ts,
        wbpg_nm
    FROM ranked_newest_first
    WHERE wbpg_nm <> '-1'
),
retained_events AS (
    -- UNION (not UNION ALL) also collapses exact duplicate events.
    SELECT sessn_id, sessn_ts, wbpg_nm FROM closing_marker
    UNION
    SELECT sessn_id, sessn_ts, wbpg_nm FROM real_pages
),
retained_oldest_first AS (
    -- Re-number the retained events, oldest first, so that consecutive
    -- events have consecutive numbers.
    SELECT
        sessn_id,
        sessn_ts,
        wbpg_nm,
        ROW_NUMBER() OVER (PARTITION BY sessn_id ORDER BY sessn_ts) AS rn_asc
    FROM retained_events
),
page_durations AS (
    -- Pair each retained event with the one that follows it; the newest
    -- retained event of a session has no follower and gets NULL.
    SELECT
        cur.sessn_id,
        cur.sessn_ts,
        cur.wbpg_nm,
        DATEDIFF(mi, cur.sessn_ts, nxt.sessn_ts) AS time_on_page
    FROM retained_oldest_first cur
    LEFT JOIN retained_oldest_first nxt
        ON nxt.sessn_id = cur.sessn_id
       AND nxt.rn_asc = cur.rn_asc + 1
    UNION
    -- Add back the '-1' rows that were filtered out, without a duration.
    SELECT
        sessn_id,
        sessn_ts,
        wbpg_nm,
        CAST(NULL AS INT) AS time_on_page
    FROM ranked_newest_first
    WHERE wbpg_nm = '-1'
      AND rn_desc <> 1
)
SELECT
    sessn_id,
    sessn_ts,
    wbpg_nm,
    time_on_page
FROM page_durations

答案 1 :(得分:1)

我假设event_ts与sessn_ts是同一列。无论如何,下面是一个应该适合您的查询:它使用OUTER APPLY,在同一会话中查找sessn_ts更晚且网页不是'-1'的记录,然后按升序取第一条结果(即下一个真实页面)。

只需将表名更改为您的表格。

以下解决方案主要使用OUTER APPLY,同时使用一个公用表表达式(CTE)来获取每个会话中最后一个'-1'事件的时间。

-- T-SQL (SQL Server): minutes spent on each real page, skipping intermediate
-- '-1' events.
--   * cteMaxNeg1 finds, per session, the latest '-1' timestamp and the latest
--     real-page timestamp.
--   * OUTER APPLY fetches the timestamp of the next real page after the current
--     row within the same session (NULL when there is none).
-- Changes from the original snippet:
--   * "TOP 1sessn_ts" had no separator between the row count and the column;
--     it is now written as TOP (1) sessn_ts.
--   * The last real page is measured against the session's latest '-1' event
--     only when that event is not earlier than the page. Previously a session
--     ending on a real page after its last '-1' produced a negative duration;
--     it now yields NULL.
;WITH cteMaxNeg1 AS (
    SELECT
        we.sessn_id
        ,MAX(CASE WHEN we.webpg_nm = '-1' THEN we.sessn_ts ELSE NULL END) AS MaxNeg1SessnTs
        ,MAX(CASE WHEN we.webpg_nm <> '-1' THEN we.sessn_ts ELSE NULL END) AS MaxPageSessnTs
    FROM
        @WebEvents we
    GROUP BY
        we.sessn_id
)

SELECT
    we.*
    -- Gap to the following event of any kind. LAG over a descending order reads
    -- the next-later row; the last row of a session has no follower and falls
    -- back to CAST(0 AS DATETIME).
    ,ISNULL(LAG(we.sessn_ts) OVER (PARTITION BY we.vstr_id, we.sessn_id ORDER BY we.sessn_ts DESC) - we.sessn_ts, CAST(0 AS DATETIME)) AS currently_known_time_on_page
    ,CASE
        -- Last real page of the session: measure up to the trailing '-1' event,
        -- but only if that event does not precede the page.
        WHEN we.webpg_nm <> '-1'
             AND we.sessn_ts = m.MaxPageSessnTs
             AND m.MaxNeg1SessnTs >= we.sessn_ts
            THEN DATEDIFF(MINUTE, we.sessn_ts, m.MaxNeg1SessnTs)
        -- Any other real page: measure up to the next real page (NULL if none).
        WHEN we.webpg_nm <> '-1'
            THEN DATEDIFF(MINUTE, we.sessn_ts, o.sessn_ts)
        -- '-1' rows carry no duration of their own.
        ELSE NULL
    END AS WantedTimeOnPage
FROM
    @WebEvents we
    LEFT JOIN cteMaxNeg1 m
        ON we.sessn_id = m.sessn_id
    OUTER APPLY (
        SELECT TOP (1) i.sessn_ts
        FROM
            @WebEvents i
        WHERE
            i.webpg_nm <> '-1'
            AND i.sessn_id = we.sessn_id
            AND i.sessn_ts > we.sessn_ts
        ORDER BY
            i.sessn_ts ASC
    ) o
ORDER BY
    we.sessn_id
    ,we.sessn_ts

这是一个只使用CTE和窗口函数的解决方案

-- T-SQL (SQL Server): same result as the OUTER APPLY version, using only a CTE
-- and window functions.
-- Rows are numbered separately for '-1' events and for real pages within each
-- session:
--   * RowNum         counts oldest-first, so a real page with RowNum + 1 is the
--                    next real page.
--   * LastNeg1RowNum counts newest-first, so 1 marks the last '-1' event and
--                    the last real page of the session.
-- Changes from the original snippet:
--   * The last real page is joined to the session's last '-1' event only when
--     that event is not earlier than the page. Previously a session ending on
--     a real page after its last '-1' produced a negative duration; it now
--     yields NULL.
--   * IIF is replaced by the equivalent standard CASE expression.
;WITH cte AS (
    SELECT
        *
        ,ROW_NUMBER() OVER (
            PARTITION BY sessn_id, CASE WHEN webpg_nm = '-1' THEN 0 ELSE 1 END
            ORDER BY sessn_ts
        ) AS RowNum
        ,ROW_NUMBER() OVER (
            PARTITION BY sessn_id, CASE WHEN webpg_nm = '-1' THEN 0 ELSE 1 END
            ORDER BY sessn_ts DESC
        ) AS LastNeg1RowNum
    FROM
        @WebEvents
)

SELECT
    c1.*
    ,CASE
        -- Last real page of the session: measure up to the trailing '-1' event
        -- (c3 is NULL when there is no such event at or after the page).
        WHEN c1.LastNeg1RowNum = 1 AND c1.webpg_nm <> '-1'
            THEN DATEDIFF(MINUTE, c1.sessn_ts, c3.sessn_ts)
        -- Any other real page: measure up to the next real page.
        WHEN c1.webpg_nm <> '-1'
            THEN DATEDIFF(MINUTE, c1.sessn_ts, c2.sessn_ts)
        -- '-1' rows carry no duration of their own.
        ELSE NULL
    END AS WantedTimeOnPage
FROM
    cte c1
    -- c2: the next real page in the same session.
    LEFT JOIN cte c2
        ON c1.sessn_id = c2.sessn_id
        AND (c1.RowNum + 1) = c2.RowNum
        AND c2.webpg_nm <> '-1'
    -- c3: the last '-1' event of the session, if it is not before the c1 row.
    LEFT JOIN cte c3
        ON c1.sessn_id = c3.sessn_id
        AND c3.LastNeg1RowNum = 1
        AND c3.webpg_nm = '-1'
        AND c3.sessn_ts >= c1.sessn_ts
ORDER BY
    c1.sessn_id
    ,c1.sessn_ts

我使用的测试数据:

-- Sample data for the T-SQL queries above, mirroring the rows of the
-- question's web_event table. Note that the page-name column is called
-- webpg_nm here, while the question's table calls it wbpg_nm.
DECLARE @WebEvents AS TABLE (
    vstr_id  CHAR(2),
    sessn_id CHAR(5),
    sessn_ts DATETIME,
    webpg_nm VARCHAR(100)
);

INSERT INTO @WebEvents (vstr_id, sessn_id, sessn_ts, webpg_nm)
VALUES
    -- Visitor V1, session V1S1: real pages interleaved with '-1' events.
    ('V1', 'V1S1', '02-02-2015 09:20:00', '/home/login'),
    ('V1', 'V1S1', '02-02-2015 09:22:00', '-1'),
    ('V1', 'V1S1', '02-02-2015 09:30:00', '/home/contacts'),
    ('V1', 'V1S1', '02-02-2015 09:32:00', '-1'),
    ('V1', 'V1S1', '02-02-2015 09:50:00', '/home/search'),
    ('V1', 'V1S1', '02-02-2015 09:55:00', '-1'),
    -- Visitor V2, sessions V2S1 and V2S2: real pages only.
    ('V2', 'V2S1', '02-02-2015 09:10:00', '/home'),
    ('V2', 'V2S1', '02-02-2015 09:15:00', '/home/apps'),
    ('V2', 'V2S2', '02-02-2015 09:20:00', '/home/news'),
    ('V2', 'V2S2', '02-02-2015 09:23:00', '/home/news/internal');