我在Netezza的web_event表中有以下格式的一些数据。
vstr_id | sessn_id | sessn_ts | wbpg_nm
V1 | V1S1 | 02-02-2015 09:20:00 | /home/login
V1 | V1S1 | 02-02-2015 09:22:00 | -1
V1 | V1S1 | 02-02-2015 09:30:00 | /home/contacts
V1 | V1S1 | 02-02-2015 09:32:00 | -1
V1 | V1S1 | 02-02-2015 09:50:00 | /home/search
V1 | V1S1 | 02-02-2015 09:55:00 | -1
V2 | V2S1 | 02-02-2015 09:10:00 | /home
V2 | V2S1 | 02-02-2015 09:15:00 | /home/apps
V2 | V2S2 | 02-02-2015 09:20:00 | /home/news
V2 | V2S2 | 02-02-2015 09:23:00 | /home/news/internal
这是我的源表。
我正在尝试使用该web_event表并创建另一个表,如下所示。
我希望像下面一样加载sessn_durtn表和time_on_pg表。
1)time_on_page列:当前页面加载与下一个页面加载之间的时间差;如果之后没有其他事件或页面加载,则会话的最后一页可以记为0秒。该值可以用分钟或秒表示。
-- Load template for time_on_pg: one output row per web_event row.
-- The ????? placeholder stands for the time-on-page expression the question asks for.
INSERT INTO time_on_pg
SELECT
    vstr_id
    ,sessn_id
    ,sessn_ts
    ,wbpg_nm
    ,????? AS time_on_page
FROM
    web_event
vstr_id | sessn_id | sessn_ts | wbpg_nm | wanted_time_on_page | currently_known_time_on_page
V1 | V1S1 | 02-02-2015 09:20:00 | /home/login | 10mins | 2mins
V1 | V1S1 | 02-02-2015 09:22:00 | -1 | | 8mins
V1 | V1S1 | 02-02-2015 09:30:00 | /home/contacts | 20mins | 2mins
V1 | V1S1 | 02-02-2015 09:32:00 | -1 | | 18mins
V1 | V1S1 | 02-02-2015 09:50:00 | /home/search | 5mins | 5mins
V1 | V1S1 | 02-02-2015 09:55:00 | -1 | |
V2 | V2S1 | 02-02-2015 09:10:00 | /home | 5mins | 5mins
V2 | V2S1 | 02-02-2015 09:15:00 | /home/apps | |
V2 | V2S2 | 02-02-2015 09:20:00 | /home/news | 3mins | 3mins
V2 | V2S2 | 02-02-2015 09:23:00 | /home/news/internal | |
我们如何在Netezza或任何SQL查询中执行此操作?
我有使用
计算current_known_time_on_page的逻辑
-- Seconds between each event and the next event of the same visitor session.
-- The ???????? placeholder is the still-unknown expression that should skip '-1' events.
SELECT
    vstr_id
    ,sessn_id
    ,sessn_ts
    ,wbpg_nm
    ,???????? AS wanted_time_on_page
    -- Uses sessn_ts: the sample web_event schema has no event_ts column.
    -- LEAD over ascending time reads the next event's timestamp (same value the
    -- original obtained with LAG over descending time); NULL for a session's last event.
    ,EXTRACT(EPOCH FROM (
        LEAD(sessn_ts) OVER (PARTITION BY vstr_id, sessn_id ORDER BY sessn_ts) - sessn_ts
    )) AS currently_known_time_on_page
FROM
    web_event;
wanted_time_on_page 与 currently_known_time_on_page 的主要区别在于:计算时间差时要忽略“-1”页,但会话中最后一条“-1”记录除外(它用于计算最后一个页面的停留时间)。
答案 0 :(得分:2)
我不知道您的数据集有多大以及您有多少可用内存。此查询在内存中完成所有操作。您可以将每个CTE转换为临时表以提高速度。
-- Time on page (whole minutes) per session, skipping intermediate '-1' events.
-- A page's time runs until the next real page of the same session, or until the
-- session's closing '-1' event when that is the last event. '-1' rows, and the last
-- row of every session, get NULL.
-- NOTE(review): vstr_id is not carried through; add it to every select list if the
-- target table needs it.
WITH CTE_SessionOrder AS (
    -- Number each session's events newest-first so RowNum = 1 is the session's last event.
    SELECT
        sessn_id
        ,sessn_ts
        ,wbpg_nm
        ,ROW_NUMBER() OVER (PARTITION BY sessn_id ORDER BY sessn_ts DESC) AS RowNum
    FROM
        web_event
)
,CTE_FilteredData AS (
    -- Events that delimit page views: every real page, plus a '-1' event only when
    -- it is the last event of its session.
    SELECT
        sessn_id
        ,sessn_ts
        ,wbpg_nm
    FROM
        CTE_SessionOrder
    WHERE
        wbpg_nm <> '-1'
        OR (RowNum = 1 AND wbpg_nm = '-1')
)
,CTE_FinalData AS (
    -- LEAD reads the next delimiting event directly, replacing a self-join on row numbers.
    SELECT
        sessn_id
        ,sessn_ts
        ,wbpg_nm
        ,DATEDIFF(mi, sessn_ts, LEAD(sessn_ts) OVER (PARTITION BY sessn_id ORDER BY sessn_ts)) AS time_on_page
    FROM
        CTE_FilteredData
    -- The two branches are disjoint, so UNION ALL avoids a needless sort/dedup;
    -- unlike UNION it also keeps exact duplicate events instead of collapsing them.
    UNION ALL
    -- Intermediate '-1' events are reported but carry no time on page.
    SELECT
        sessn_id
        ,sessn_ts
        ,wbpg_nm
        ,CAST(NULL AS INT) AS time_on_page
    FROM
        CTE_SessionOrder
    WHERE
        RowNum <> 1
        AND wbpg_nm = '-1'
)
SELECT
    sessn_id
    ,sessn_ts
    ,wbpg_nm
    ,time_on_page
FROM
    CTE_FinalData
ORDER BY
    sessn_id
    ,sessn_ts
答案 1 :(得分:1)
我假设 event_ts 与 sessn_ts 是同一列。无论如何,下面的查询应该能满足您的需求:它使用 OUTER APPLY 在表中查找同一会话内时间晚于当前行(> sessn_ts)且网页不是 '-1' 的记录,然后按时间升序取第一条(TOP 1)。
只需将表名更改为您的表格。
以下解决方案主要使用 OUTER APPLY,同时借助公用表表达式(CTE)获取每个会话中最后一条 '-1' 记录的时间。
-- Wanted time on page (whole minutes) via OUTER APPLY.
-- For a real page the time runs to the next real page of the same session; for the
-- session's last real page it runs to the session's latest '-1' event.
-- '-1' rows get NULL.
;WITH cteMaxNeg1 AS (
    -- Per session: timestamp of the latest '-1' event and of the latest real page.
    SELECT
        we.sessn_id
        ,MAX(CASE WHEN we.webpg_nm = '-1' THEN we.sessn_ts ELSE NULL END) AS MaxNeg1SessnTs
        ,MAX(CASE WHEN we.webpg_nm <> '-1' THEN we.sessn_ts ELSE NULL END) AS MaxPageSessnTs
    FROM
        @WebEvents we
    GROUP BY
        we.sessn_id
)
SELECT
    we.*
    -- DATETIME difference to the next event of the session; 0 cast to DATETIME when
    -- there is no later event.
    ,ISNULL(
        LAG(we.sessn_ts) OVER (PARTITION BY we.vstr_id, we.sessn_id ORDER BY we.sessn_ts DESC) - we.sessn_ts
        ,CAST(0 AS DATETIME)
     ) AS currently_known_time_on_page
    ,CASE
        -- Last real page: measure to the latest '-1' only if that '-1' is not earlier
        -- than the page; otherwise the result would be a negative duration.
        WHEN we.sessn_ts = m.MaxPageSessnTs
             AND we.webpg_nm <> '-1'
             AND m.MaxNeg1SessnTs >= we.sessn_ts
            THEN DATEDIFF(MINUTE, we.sessn_ts, m.MaxNeg1SessnTs)
        -- Any other real page: measure to the next real page (NULL when there is none).
        WHEN we.webpg_nm <> '-1'
            THEN DATEDIFF(MINUTE, we.sessn_ts, o.sessn_ts)
        ELSE NULL
     END AS WantedTimeOnPage
FROM
    @WebEvents we
    LEFT JOIN cteMaxNeg1 m
        ON we.sessn_id = m.sessn_id
    -- Earliest later real page in the same session, if any.
    OUTER APPLY (
        SELECT TOP (1)
            i.sessn_ts
        FROM
            @WebEvents i
        WHERE
            i.webpg_nm <> '-1'
            AND i.sessn_id = we.sessn_id
            AND i.sessn_ts > we.sessn_ts
        ORDER BY
            i.sessn_ts ASC
    ) o
ORDER BY
    we.sessn_id
    ,we.sessn_ts
这是一个只使用CTE和窗口函数的解决方案
-- Wanted time on page (whole minutes) using only a CTE and window functions.
-- Rows are numbered separately for '-1' events and for real pages within each
-- session, ascending (RowNum) and descending (LastNeg1RowNum), so LastNeg1RowNum = 1
-- marks the last '-1' event and the last real page of a session.
;WITH cte AS (
    SELECT
        *
        ,ROW_NUMBER() OVER (
            PARTITION BY sessn_id, CASE WHEN webpg_nm = '-1' THEN 0 ELSE 1 END
            ORDER BY sessn_ts
         ) AS RowNum
        ,ROW_NUMBER() OVER (
            PARTITION BY sessn_id, CASE WHEN webpg_nm = '-1' THEN 0 ELSE 1 END
            ORDER BY sessn_ts DESC
         ) AS LastNeg1RowNum
    FROM
        @WebEvents
)
SELECT
    c1.*
    ,CASE
        -- Last real page of the session: measure to the session's last '-1' event.
        WHEN c1.LastNeg1RowNum = 1 AND c1.webpg_nm <> '-1'
            THEN DATEDIFF(MINUTE, c1.sessn_ts, c3.sessn_ts)
        -- Any other real page: measure to the next real page.
        WHEN c1.webpg_nm <> '-1'
            THEN DATEDIFF(MINUTE, c1.sessn_ts, c2.sessn_ts)
        ELSE NULL
     END AS WantedTimeOnPage
FROM
    cte c1
    -- c2: the next real page of the same session.
    LEFT JOIN cte c2
        ON c1.sessn_id = c2.sessn_id
        AND (c1.RowNum + 1) = c2.RowNum
        AND c2.webpg_nm <> '-1'
    -- c3: the session's last '-1' event, used only when it is not earlier than c1;
    -- without that guard a '-1' preceding the last page yields a negative duration.
    LEFT JOIN cte c3
        ON c1.sessn_id = c3.sessn_id
        AND c3.LastNeg1RowNum = 1
        AND c3.webpg_nm = '-1'
        AND c3.sessn_ts >= c1.sessn_ts
ORDER BY
    c1.sessn_id
    ,c1.sessn_ts
我使用的测试数据:
-- Test fixture: the question's sample web_event rows, held in a table variable.
DECLARE @WebEvents TABLE (
    vstr_id CHAR(2)
    ,sessn_id CHAR(5)
    ,sessn_ts DATETIME
    ,webpg_nm VARCHAR(100)
);

INSERT INTO @WebEvents (vstr_id, sessn_id, sessn_ts, webpg_nm)
VALUES
    -- Visitor V1, session V1S1: pages interleaved with '-1' events.
    ('V1', 'V1S1', '02-02-2015 09:20:00', '/home/login')
    ,('V1', 'V1S1', '02-02-2015 09:22:00', '-1')
    ,('V1', 'V1S1', '02-02-2015 09:30:00', '/home/contacts')
    ,('V1', 'V1S1', '02-02-2015 09:32:00', '-1')
    ,('V1', 'V1S1', '02-02-2015 09:50:00', '/home/search')
    ,('V1', 'V1S1', '02-02-2015 09:55:00', '-1')
    -- Visitor V2, sessions V2S1 and V2S2: no '-1' events.
    ,('V2', 'V2S1', '02-02-2015 09:10:00', '/home')
    ,('V2', 'V2S1', '02-02-2015 09:15:00', '/home/apps')
    ,('V2', 'V2S2', '02-02-2015 09:20:00', '/home/news')
    ,('V2', 'V2S2', '02-02-2015 09:23:00', '/home/news/internal')