我正在尝试从超市的交易中识别购物篮。
我有客户ID、交易ID,以及从货架上取下商品的时间(即交易时间)。数据里确实有一个篮子ID,但它并不准确:我发现同一个篮子ID下的交易发生在一天中的不同时间,因此它们实际上应该属于不同的篮子。这是数据本身的缺陷,我无法从源头上修正。
如果一笔交易与前一笔交易之间的时间间隔大于20分钟,我就认为它属于另一个篮子。我是用SQL中的LAG函数来计算这个间隔的。
当该间隔大于20分钟时,我会给这笔交易加上一个值为1的标志。也就是说,现有的一个篮子ID实际上可能对应一个或多个真实的篮子。
有没有什么办法可以据此生成 real_basket_id?
非常感谢
安德鲁
答案 0 :(得分:0)
作为一种方案,您可以尝试使用递归CTE。请看下面的示例。
-- Demo fixture: one row per transaction, carrying the buyer, the basket id
-- recorded in the source data, and the time of the transaction.
CREATE TABLE #baskets
(
    buyer_id   INT,
    basket_id  INT,
    trans_time DATETIME
);

-- Buyer 1: transactions at 01:00, 01:05, 01:15 and 01:50.
-- Buyer 2: transactions at 02:00 and 02:45.
INSERT INTO #baskets (buyer_id, basket_id, trans_time)
VALUES
    (1, 11, DATETIMEFROMPARTS(2017, 12, 14, 1,  0, 0, 0)),
    (1, 12, DATETIMEFROMPARTS(2017, 12, 14, 1,  5, 0, 0)),
    (1, 12, DATETIMEFROMPARTS(2017, 12, 14, 1, 15, 0, 0)),
    (1, 13, DATETIMEFROMPARTS(2017, 12, 14, 1, 50, 0, 0)),
    (2, 21, DATETIMEFROMPARTS(2017, 12, 14, 2,  0, 0, 0)),
    (2, 22, DATETIMEFROMPARTS(2017, 12, 14, 2, 45, 0, 0));

-- Show the raw input, per buyer in chronological order.
SELECT
    buyer_id,
    basket_id,
    trans_time
FROM #baskets
ORDER BY
    buyer_id,
    trans_time;
-- Derive real_basket_id by walking each buyer's transactions in time order:
-- a transaction stays in the current basket when it happens no more than
-- 20 minutes after the buyer's previous transaction; otherwise it opens a new
-- basket identified by its own basket_id.
-- The leading semicolon terminates whatever statement precedes the WITH.
;WITH numBaskCTE AS (
    -- Number each buyer's transactions chronologically. basket_id is a
    -- tie-breaker so equal timestamps are numbered the same way every time
    -- this CTE is evaluated (it is referenced by both halves of the recursion).
    SELECT
        buyer_id,
        basket_id,
        trans_time,
        ROW_NUMBER() OVER (
            PARTITION BY buyer_id
            ORDER BY trans_time, basket_id
        ) AS n
    FROM #baskets
),
checkBaskCTE AS (
    -- Anchor: each buyer's first transaction opens that buyer's first basket.
    SELECT
        buyer_id,
        basket_id,
        trans_time,
        n,
        basket_id  AS real_basket_id,
        trans_time AS prev_time
    FROM numBaskCTE
    WHERE n = 1

    UNION ALL

    -- Step: compare transaction n+1 with transaction n of the same buyer.
    SELECT
        cur.buyer_id,
        cur.basket_id,
        cur.trans_time,
        cur.n,
        -- Carry the already-resolved real_basket_id forward (not the previous
        -- row's raw basket_id), so every row of a basket gets the same id.
        -- NOTE: DATEDIFF(MINUTE, ...) counts minute boundaries crossed, not
        -- elapsed 60-second spans.
        CASE
            WHEN DATEDIFF(MINUTE, prev.prev_time, cur.trans_time) <= 20
                THEN prev.real_basket_id
            ELSE cur.basket_id
        END AS real_basket_id,
        -- The next step always measures the gap from this transaction.
        cur.trans_time AS prev_time
    FROM checkBaskCTE AS prev
    INNER JOIN numBaskCTE AS cur
        ON  cur.buyer_id = prev.buyer_id
        AND cur.n = prev.n + 1
)
SELECT
    buyer_id,
    basket_id,
    trans_time,
    real_basket_id
FROM checkBaskCTE
ORDER BY
    buyer_id,
    trans_time
-- One recursion level is used per transaction of a buyer; lift the default
-- limit of 100 levels so buyers with many transactions do not abort the query.
OPTION (MAXRECURSION 0);

-- Clean up the demo table.
DROP TABLE #baskets;
如果表中已经有 real_basket_id 列,那么只需对新行(WHERE real_basket_id IS NULL)执行更新即可。
-- Demo fixture for the incremental case: same shape as before plus a
-- real_basket_id column. Rows that already carry a real_basket_id have been
-- processed; rows where it is NULL are the new ones.
CREATE TABLE #baskets
(
    buyer_id       INT,
    basket_id      INT,
    trans_time     DATETIME,
    real_basket_id INT
);

INSERT INTO #baskets (buyer_id, basket_id, trans_time, real_basket_id)
VALUES
    -- Buyer 1: two processed rows, then four new rows on 2017-12-14.
    (1, 10, DATETIMEFROMPARTS(2017, 12, 12, 21, 40, 0, 0), 10),
    (1, 11, DATETIMEFROMPARTS(2017, 12, 13, 22, 30, 0, 0), 11),
    (1, 12, DATETIMEFROMPARTS(2017, 12, 14,  1,  0, 0, 0), NULL),
    (1, 13, DATETIMEFROMPARTS(2017, 12, 14,  1,  5, 0, 0), NULL),
    (1, 13, DATETIMEFROMPARTS(2017, 12, 14,  1, 15, 0, 0), NULL),
    (1, 13, DATETIMEFROMPARTS(2017, 12, 14,  1, 50, 0, 0), NULL),
    -- Buyer 2: new rows only.
    (2, 21, DATETIMEFROMPARTS(2017, 12, 14,  2,  0, 0, 0), NULL),
    (2, 22, DATETIMEFROMPARTS(2017, 12, 14,  2, 45, 0, 0), NULL),
    -- Buyer 3: the last processed row (00:54) is 6 minutes before the first
    -- new row (01:00).
    (3, 30, DATETIMEFROMPARTS(2017, 12, 12, 21, 40, 0, 0), 30),
    (3, 31, DATETIMEFROMPARTS(2017, 12, 14,  0, 54, 0, 0), 31),
    (3, 32, DATETIMEFROMPARTS(2017, 12, 14,  1,  0, 0, 0), NULL),
    (3, 33, DATETIMEFROMPARTS(2017, 12, 14,  1,  5, 0, 0), NULL);

-- Show only the rows that still need a real_basket_id.
SELECT
    buyer_id,
    basket_id,
    trans_time,
    real_basket_id
FROM #baskets
WHERE real_basket_id IS NULL -- only new rows
ORDER BY
    buyer_id,
    trans_time;
-- Assign real_basket_id to the new rows only (real_basket_id IS NULL).
-- Each buyer's walk starts from the last already-processed transaction, when
-- there is one, so a new transaction that follows it within 20 minutes joins
-- that existing basket.
-- The leading semicolon terminates whatever statement precedes the WITH.
;WITH numBaskCTE AS (
    -- Keep every row whose chronological successor has no real_basket_id, or
    -- that has no successor at all: that is all new transactions plus the last
    -- processed transaction of each buyer. Then number them per buyer.
    -- NOTE(review): assumes a buyer's processed rows all precede the new
    -- ones in time; confirm for data that can arrive out of order.
    -- basket_id is a tie-breaker so equal timestamps are ordered consistently.
    SELECT
        buyer_id,
        basket_id,
        real_basket_id,
        trans_time,
        ROW_NUMBER() OVER (
            PARTITION BY buyer_id
            ORDER BY trans_time, basket_id
        ) AS n
    FROM
    (
        SELECT
            buyer_id,
            basket_id,
            trans_time,
            real_basket_id,
            LEAD(real_basket_id) OVER (
                PARTITION BY buyer_id
                ORDER BY trans_time, basket_id
            ) AS next_real_basket_id
        FROM #baskets
    ) AS q
    WHERE next_real_basket_id IS NULL
),
checkBaskCTE AS (
    -- Anchor: the first kept row per buyer. It keeps its stored
    -- real_basket_id when it is a processed row; otherwise it opens a basket
    -- under its own basket_id and is flagged as new.
    SELECT
        buyer_id,
        basket_id,
        trans_time,
        n,
        ISNULL(real_basket_id, basket_id) AS real_basket_id,
        trans_time AS prev_time,
        CASE WHEN real_basket_id IS NULL THEN 1 ELSE 0 END AS is_new_row
    FROM numBaskCTE
    WHERE n = 1

    UNION ALL

    -- Step: compare transaction n+1 with transaction n of the same buyer.
    SELECT
        cur.buyer_id,
        cur.basket_id,
        cur.trans_time,
        cur.n,
        -- Carry the already-resolved real_basket_id forward (not the previous
        -- row's raw basket_id), so every row of a basket gets the same id.
        -- NOTE: DATEDIFF(MINUTE, ...) counts minute boundaries crossed, not
        -- elapsed 60-second spans.
        CASE
            WHEN DATEDIFF(MINUTE, prev.prev_time, cur.trans_time) <= 20
                THEN prev.real_basket_id
            ELSE cur.basket_id
        END AS real_basket_id,
        -- The next step always measures the gap from this transaction.
        cur.trans_time AS prev_time,
        -- Every row after the anchor is a new row.
        1 AS is_new_row
    FROM checkBaskCTE AS prev
    INNER JOIN numBaskCTE AS cur
        ON  cur.buyer_id = prev.buyer_id
        AND cur.n = prev.n + 1
)
UPDATE b
SET
    b.real_basket_id = q.real_basket_id
FROM #baskets AS b
INNER JOIN checkBaskCTE AS q
    ON  b.buyer_id = q.buyer_id
    AND b.trans_time = q.trans_time
    AND b.basket_id = q.basket_id
WHERE q.is_new_row = 1
  -- Never overwrite a row that already has a real_basket_id.
  AND b.real_basket_id IS NULL
-- One recursion level is used per kept transaction of a buyer; lift the
-- default limit of 100 levels so large batches do not abort the statement.
OPTION (MAXRECURSION 0);
-- Show the whole table after the update, per buyer in chronological order.
SELECT
    buyer_id,
    basket_id,
    trans_time,
    real_basket_id
FROM #baskets
ORDER BY
    buyer_id,
    trans_time;

-- Clean up the demo table.
DROP TABLE #baskets;