营销路径计数示例:SEO > SEO > Paid 1;Paid > Paid > Affiliate > Paid 1;SEO > Affiliate 1。我的查询会生成包含客户ID号、营销渠道、时间戳和购买日期的数据。因此,结果可能看起来像这样。
id marketingChannel TimeStamp Transaction_date
1 SEO 5/18 23:11:43 5/18
1 SEO 5/18 24:12:43 5/18
1 Paid 5/18 24:13:43 5/18
2 Paid 5/18 24:12:43 5/18
2 Paid 5/18 24:14:43 5/18
2 Affiliate 5/18 24:20:43 5/18
2 Paid 5/18 24:22:43 5/18
3 SEO 5/18 24:10:43 5/18
3 Affiliate 5/18 24:11:43 5/18
我想知道是否有查询以显示营销路径计数的方式聚合此信息。
例如。
Marketing Path Count
SEO > SEO > Paid 1
Paid > Paid > Affiliate > Paid 1
SEO > Affiliate 1
我正在考虑编写一个Python脚本来获取这些信息,但我想知道SQL中是否有一个简单的解决方案 - 因为我不熟悉SQL。
答案 0 :(得分:3)
几年前我需要一个类似的结果,我测试了在Teradata中获取串联字符串的不同方法。顺便说一下,如果行数太高而且连接的字符串超过64000个字符,则所有这些都可能失败。
效率最高的是用户定义函数(用C语言编写):
-- Count how many ids share each marketing path.
-- DelimitedBuildSorted is a user-defined function (not a built-in) and must
-- be installed before this query can run.
SELECT
    PATH,
    COUNT(*)
FROM (
    -- One row per id. Presumably the UDF joins the channel values with '>'
    -- in the order of its second argument (the timestamp rendered as a
    -- 14-character yyyymmddhhmiss string) -- confirm against the UDF source.
    SELECT
        DelimitedBuildSorted(
            MARKETINGCHANNEL,
            CAST(CAST(ts AS FORMAT 'yyyymmddhhmiss') AS VARCHAR(14)),
            '>'
        ) AS PATH
    FROM t
    GROUP BY id
) AS id_paths
GROUP BY PATH;
如果您需要经常运行该查询,和/或在大型表上运行该查询,可以与您的DBA沟通是否允许使用UDF(大多数DBA不喜欢UDF,因为它们是用DBA不熟悉的语言——C——编写的)。
如果每个id的平均行数很少,则递归可能没问题。 Joseph B的版本可以稍微简化,但最重要的是创建一个临时表而不是使用View或Derived Table进行ROW_NUMBER计算。这会产生更好的计划(在SQL Server中也是如此):
-- Stage the per-id row numbering once in a volatile table so the recursive
-- query that reads vt does not recompute the window functions.
CREATE VOLATILE TABLE vt AS
(
    SELECT
        id,
        MarketingChannel,
        -- Descending order: rn = 1 is the newest row of an id.
        ROW_NUMBER() OVER (PARTITION BY id ORDER BY TS DESC) AS rn,
        -- Number of rows of the id, i.e. the rn of its oldest row.
        COUNT(*) OVER (PARTITION BY id) AS max_rn
    FROM t
)
WITH DATA
PRIMARY INDEX (id)
ON COMMIT PRESERVE ROWS;
-- Build each id's path by walking vt from its oldest row (rn = max_rn) to
-- its newest row (rn = 1), then count the ids per finished path.
WITH RECURSIVE cte (id, path, rn) AS
(
    -- Anchor: the oldest row of every id starts the path.
    SELECT
        vt.id,
        -- modify VARCHAR size to fit your maximum number of rows, that's better than VARCHAR(64000)
        CAST(vt.MarketingChannel AS VARCHAR(10000)) AS PATH,
        vt.rn
    FROM vt
    WHERE vt.rn = vt.max_rn

    UNION ALL

    -- Step: append the next newer row (rn one lower) of the same id.
    SELECT
        cte.ID,
        cte.PATH || '>' || vt.MarketingChannel,
        cte.rn - 1
    FROM cte
    INNER JOIN vt
        ON  vt.id = cte.id
        AND vt.rn = cte.rn - 1
)
SELECT
    PATH,
    COUNT(*)
FROM cte
WHERE rn = 1          -- only the completed path of each id
GROUP BY PATH
ORDER BY PATH;
您也可以尝试老派的 MAX(CASE) 写法:
-- Count ids per marketing path, building each path by pivoting the rows of
-- an id with MAX(CASE ...) on their zero-based position rnk.
-- Only positions 0..31 are handled: rows beyond the 32nd of an id match no
-- CASE branch and are left out of the path.
SELECT
    PATH
    ,COUNT(*)
FROM
(
    SELECT
        id
        ,MAX(CASE WHEN rnk = 0 THEN MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 1 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 2 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 3 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 4 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 5 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 6 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 7 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 8 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 9 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 10 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 11 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 12 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 13 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 14 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 15 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 16 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 17 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 18 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 19 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 20 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 21 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 22 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 23 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 24 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 25 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 26 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 27 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 28 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 29 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 30 THEN '>' || MarketingChannel ELSE '' END) ||
         MAX(CASE WHEN rnk = 31 THEN '>' || MarketingChannel ELSE '' END) AS PATH
    FROM
    (
        SELECT
            id
            ,TRIM(MarketingChannel) AS MarketingChannel
            -- ROW_NUMBER rather than RANK: RANK gives rows with the same TS
            -- the same position, so MAX() would keep only one channel of a
            -- tie and drop the others. MarketingChannel as a tie-breaker
            -- makes the order of tied rows repeatable.
            ,ROW_NUMBER() OVER (PARTITION BY id
                                ORDER BY TS, MarketingChannel) - 1 AS rnk
        FROM t
    ) dt
    GROUP BY id
) AS dt
GROUP BY PATH;
我曾经需要用这种方式聚合多达2048行(每行30个字符):-)
-- Count ids per marketing path for up to 16 * 16 * 8 = 2048 rows per id.
-- The path is assembled in three MAX(CASE ...) passes: each pass joins up to
-- 16 (last pass: 8) consecutive pieces and divides the position by 16, so
-- the next pass sees one row per chunk.
-- Positional GROUP BY is kept on purpose: the aliases rnk and path share
-- their names with input columns, so grouping by name would be ambiguous.
SELECT
    PATH
    ,COUNT(*)
FROM
(
    -- Pass 3: join up to 8 chunks (of up to 256 rows each) into the full path.
    SELECT
        id
        ,MAX(CASE WHEN rnk MOD 16 = 0 THEN path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 = 1 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 = 2 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 = 3 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 = 4 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 = 5 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 = 6 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 = 7 THEN '>' || path ELSE '' END) AS PATH
    FROM
    (
        -- Pass 2: join 16 chunks of 16 rows into chunks of up to 256 rows.
        SELECT
            id
            ,rnk / 16 AS rnk
            ,MAX(CASE WHEN rnk MOD 16 = 0 THEN path ELSE '' END) ||
             MAX(CASE WHEN rnk MOD 16 = 1 THEN '>' || path ELSE '' END) ||
             MAX(CASE WHEN rnk MOD 16 = 2 THEN '>' || path ELSE '' END) ||
             MAX(CASE WHEN rnk MOD 16 = 3 THEN '>' || path ELSE '' END) ||
             MAX(CASE WHEN rnk MOD 16 = 4 THEN '>' || path ELSE '' END) ||
             MAX(CASE WHEN rnk MOD 16 = 5 THEN '>' || path ELSE '' END) ||
             MAX(CASE WHEN rnk MOD 16 = 6 THEN '>' || path ELSE '' END) ||
             MAX(CASE WHEN rnk MOD 16 = 7 THEN '>' || path ELSE '' END) ||
             MAX(CASE WHEN rnk MOD 16 = 8 THEN '>' || path ELSE '' END) ||
             MAX(CASE WHEN rnk MOD 16 = 9 THEN '>' || path ELSE '' END) ||
             MAX(CASE WHEN rnk MOD 16 = 10 THEN '>' || path ELSE '' END) ||
             MAX(CASE WHEN rnk MOD 16 = 11 THEN '>' || path ELSE '' END) ||
             MAX(CASE WHEN rnk MOD 16 = 12 THEN '>' || path ELSE '' END) ||
             MAX(CASE WHEN rnk MOD 16 = 13 THEN '>' || path ELSE '' END) ||
             MAX(CASE WHEN rnk MOD 16 = 14 THEN '>' || path ELSE '' END) ||
             MAX(CASE WHEN rnk MOD 16 = 15 THEN '>' || path ELSE '' END) AS path
        FROM
        (
            -- Pass 1: join every 16 consecutive rows of an id into one chunk.
            SELECT
                id
                ,rnk / 16 AS rnk
                ,MAX(CASE WHEN rnk MOD 16 = 0 THEN path ELSE '' END) ||
                 MAX(CASE WHEN rnk MOD 16 = 1 THEN '>' || path ELSE '' END) ||
                 MAX(CASE WHEN rnk MOD 16 = 2 THEN '>' || path ELSE '' END) ||
                 MAX(CASE WHEN rnk MOD 16 = 3 THEN '>' || path ELSE '' END) ||
                 MAX(CASE WHEN rnk MOD 16 = 4 THEN '>' || path ELSE '' END) ||
                 MAX(CASE WHEN rnk MOD 16 = 5 THEN '>' || path ELSE '' END) ||
                 MAX(CASE WHEN rnk MOD 16 = 6 THEN '>' || path ELSE '' END) ||
                 MAX(CASE WHEN rnk MOD 16 = 7 THEN '>' || path ELSE '' END) ||
                 MAX(CASE WHEN rnk MOD 16 = 8 THEN '>' || path ELSE '' END) ||
                 MAX(CASE WHEN rnk MOD 16 = 9 THEN '>' || path ELSE '' END) ||
                 MAX(CASE WHEN rnk MOD 16 = 10 THEN '>' || path ELSE '' END) ||
                 MAX(CASE WHEN rnk MOD 16 = 11 THEN '>' || path ELSE '' END) ||
                 MAX(CASE WHEN rnk MOD 16 = 12 THEN '>' || path ELSE '' END) ||
                 MAX(CASE WHEN rnk MOD 16 = 13 THEN '>' || path ELSE '' END) ||
                 MAX(CASE WHEN rnk MOD 16 = 14 THEN '>' || path ELSE '' END) ||
                 MAX(CASE WHEN rnk MOD 16 = 15 THEN '>' || path ELSE '' END) AS path
            FROM
            (
                SELECT
                    id
                    ,TRIM(MarketingChannel) AS PATH
                    -- ROW_NUMBER rather than RANK: RANK gives rows with the
                    -- same TS the same position, so MAX() would keep only
                    -- one channel of a tie and drop the others.
                    -- MarketingChannel as a tie-breaker makes the order of
                    -- tied rows repeatable.
                    ,ROW_NUMBER() OVER (PARTITION BY id
                                        ORDER BY TS, MarketingChannel) - 1 AS rnk
                FROM t
            ) dt
            GROUP BY 1,2
        ) dt
        GROUP BY 1,2
    ) dt
    GROUP BY 1
) dt
GROUP BY 1;
答案 1 :(得分:2)
这是一个已经过SQL Server测试的查询。相同的语法也适用于Teradata:
修改:
将多个CTE转换为单个CTE:
-- Count ids per marketing path with a single recursive CTE.
-- Dialect note: this text mixes two dialects. The RECURSIVE keyword is
-- Teradata/ANSI syntax (SQL Server's WITH does not accept it), while
-- VARCHAR(MAX) and string concatenation with + are SQL Server syntax
-- (Teradata uses a bounded VARCHAR(n) and ||). Adjust to the target engine.
WITH RECURSIVE Single_Path (CURRENT_ID, CURRENT_PATH, CURRENT_TS, rn) AS
(
    -- Anchor: the newest row of every id (rn = 1 in descending order).
    SELECT
        ID CURRENT_ID,
        CAST(MARKETINGCHANNEL AS VARCHAR(MAX)) CURRENT_PATH,
        TIMESTAMP CURRENT_TS,
        1 RN
    FROM
    (
        SELECT
            id,
            marketingChannel,
            TimeStamp,
            ROW_NUMBER() OVER (PARTITION BY id ORDER BY TimeStamp DESC) rn
        FROM T
    ) Ordered_Data
    WHERE RN = 1

    UNION ALL

    -- Step: prepend the next older row of the same id, so the finished path
    -- reads oldest -> newest.
    SELECT
        od.ID,
        CAST(od.MARKETINGCHANNEL + ' > ' + sp.CURRENT_PATH AS VARCHAR(MAX)),
        od.TIMESTAMP,
        sp.rn + 1
    FROM
    (
        SELECT
            id,
            marketingChannel,
            TimeStamp,
            ROW_NUMBER() OVER (PARTITION BY id ORDER BY TimeStamp DESC) rn
        FROM T
    ) od                              -- one alias only; "ORDERED_DATA od" was a syntax error
    INNER JOIN Single_Path sp
        ON  od.id = sp.CURRENT_ID
        AND od.rn = sp.rn + 1
)
SELECT
    sp2.CURRENT_PATH MARKETING_PATH,
    COUNT(*) COUNT
FROM Single_Path sp2
INNER JOIN
(
    -- Rows per id. rn is a ROW_NUMBER over all rows of the id, so its
    -- maximum equals this count. The derived table Ordered_Data is not
    -- visible here, which is why the count is taken from T directly.
    SELECT
        ID,
        COUNT(*) max_rn
    FROM T
    GROUP BY ID
) MR
    ON SP2.CURRENT_ID = MR.ID AND SP2.RN = MR.MAX_RN   -- keep only the completed path of each id
GROUP BY SP2.CURRENT_PATH
ORDER BY sp2.CURRENT_PATH;
参考:
答案 2 :(得分:1)
假设MySQL:
-- Count ids per marketing path (MySQL).
-- Note: GROUP_CONCAT results are truncated to group_concat_max_len
-- (default 1024); raise it for long paths.
select
    path,
    count(*)
from (
    select
        id,
        -- ORDER BY inside GROUP_CONCAT is required: without it the order of
        -- the concatenated channels is not guaranteed to be chronological.
        group_concat(marketingChannel order by `TimeStamp` separator ' > ') as path
    from
        t
    group by id
) sq
group by path;