我想按时间顺序,为每个分区内的数值计算运行(累积)中位数(median)。
基本上,我有这样一张表:
-- Sample table for the running-median question: one value per client per timestamp.
CREATE TABLE "SomeData" (
    ClientId INT,             -- the running median is partitioned by this column
    SomeData DECIMAL(10, 2),  -- the value whose running median is wanted
    SomeDate TIMESTAMP        -- the running window is ordered by this column
);
有些数据:
-- Sample rows: four for client 1 and seven for client 2, one per day.
INSERT INTO "SomeData"
    (ClientId, SomeData, SomeDate)
VALUES
    -- client 1: values 1 through 4
    (1,   1, '1 Jan 2000'),
    (1,   2, '2 Jan 2000'),
    (1,   3, '3 Jan 2000'),
    (1,   4, '4 Jan 2000'),
    -- client 2: three rows of 100 followed by four rows of 200
    (2, 100, '1 Jan 2000'),
    (2, 100, '2 Jan 2000'),
    (2, 100, '3 Jan 2000'),
    (2, 200, '4 Jan 2000'),
    (2, 200, '5 Jan 2000'),
    (2, 200, '6 Jan 2000'),
    (2, 200, '7 Jan 2000');
我需要一个按 ClientId 分区、按 SomeDate 排序的运行中位数。
基本上,我需要制作的是:
ClientId SomeDate Median of SomeData
1 "2000-01-01" 1.000
1 "2000-01-02" 1.500
1 "2000-01-03" 2.000
1 "2000-01-04" 2.500
2 "2000-01-01" 100.0
2 "2000-01-02" 100.0
2 "2000-01-03" 100.0
2 "2000-01-04" 100.0
2 "2000-01-05" 100.0
2 "2000-01-06" 150.0
2 "2000-01-07" 200.0
在 PostgreSQL 9.x 中,我可以借助 aggregate_median 函数用多种方式做到这一点,
但在 Redshift 中却很难实现,因为 Redshift 只提供聚合形式的 median:
-- Running median per client in date order, written as an ordered window over median().
-- Redshift rejects this form: its median window function does not accept an
-- ORDER BY or a frame clause in the window specification.
SELECT
    ClientId,
    SomeDate,
    MEDIAN(SomeData) OVER (
        PARTITION BY ClientId
        ORDER BY SomeDate
    )
FROM "SomeData" AS xout
ORDER BY
    ClientId,
    SomeDate;
但在 Redshift 上运行上面的查询会报错:
错误:窗口函数 median 的窗口规范不应包含 frame 子句和 ORDER BY
也可以改用手写的、回连原表的相关子查询来代替 median 窗口函数(这在 PostgreSQL 中可行),但 Redshift 似乎同样不支持:
错误:由于内部错误,
不支持这种类型的相关子查询模式
这里有一些 fiddle 示例,但其中没有一个能在 Redshift 中运行。
目前看来,我需要把数据取到内存中并用代码来完成计算;但如果能直接在 Redshift 中完成,我将非常感激。
答案 0 :(得分:2)
我想知道你是否可以使用nth_value()
执行此操作:
-- Running "middle value" per client, taken in SomeDate order.
--
-- seqnum is the number of rows seen so far for the client (a running COUNT(*)),
-- and NTH_VALUE picks row number ceil(seqnum / 2) of that running window.
--
-- Fixes relative to the first draft:
--   * NTH_VALUE takes two arguments (value expression, offset); the value
--     expression SomeData was missing.
--   * The offset must be greater than 0, but seqnum / 2 is 0 on the first row
--     (integer division), so (seqnum + 1) / 2 is used instead.
--   * Both windows carry an explicit ROWS frame: Redshift requires a frame
--     clause on an ordered aggregate window, and ROWS (unlike the default
--     RANGE) counts physical rows even when SomeDate has ties.
--   * The table is referenced as "SomeData", matching how it was created.
--
-- NOTE(review): the window is ordered by SomeDate, not by SomeData, so this
-- returns the lower-middle row in date order. That is a true median only when
-- SomeData never decreases over time within a client, and for an even row
-- count it does not average the two middle values.
SELECT ClientId, SomeDate,
       NTH_VALUE(SomeData, CAST((seqnum + 1) / 2 AS INTEGER))
           OVER (PARTITION BY ClientId ORDER BY SomeDate
                 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
FROM (SELECT s.*,
             COUNT(*) OVER (PARTITION BY ClientId ORDER BY SomeDate
                            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as seqnum
      FROM "SomeData" s
     ) s
ORDER BY ClientId, SomeDate;
请注意:这里用 COUNT(*) 代替了 ROW_NUMBER(),
这种写法可能需要一点时间来适应。
答案 1 :(得分:2)
我认为 @GordonLinoff 提出的方案并不正确,因为它没有按照要求中位数的那一列的值对行进行排序。下面是一种正确的做法(思路参考自他处),
可在 Redshift 上运行:
-- Running median per client via a correlated subquery over the aggregate MEDIAN.
--
-- The CTE numbers each client's rows in SomeDate order. For every outer row A,
-- the subquery takes the median of that client's rows numbered 1 through
-- A.row_num.
WITH CTE
AS
(
SELECT ClientId,
       ROW_NUMBER() OVER (PARTITION BY ClientId ORDER BY SomeDate ASC) row_num,
       SomeDate,
       SomeData
FROM "SomeData"
)
SELECT A.SomeDate,
       A.SomeData,
       (SELECT MEDIAN(B.SomeData)
        FROM CTE B
        -- row_num restarts at 1 for every client, so the subquery must also be
        -- correlated on ClientId; without this predicate, rows from all clients
        -- with a small enough row_num leak into the median.
        WHERE B.ClientId = A.ClientId
          AND B.row_num BETWEEN 1 AND A.row_num
        GROUP BY A.ClientId) AS median
FROM CTE A
答案 2 :(得分:1)
这是对您所求数值的精确计算。
写法本身并不优雅,但它能正确处理元素个数为奇数和偶数两种情况下的中位数。
-- Template for a running median built from NTH_VALUE.
-- Replace data / partitionField / orderField / medianField with real names.
--
-- NOTE(review): every window below is ordered by orderField, so the "middle"
-- rows are the middle rows in orderField order. That matches the median only if
-- medianField is non-decreasing along orderField — confirm for your data.
WITH numbered AS (
    -- seqnum is the 1-based position of the row inside its partition, which is
    -- also the size of the running window that ends at that row.
    SELECT
        d.partitionField,  -- the field (or fields) the window functions are partitioned by
        d.orderField,      -- the sort field for the window functions
        d.medianField,     -- the quantity you are computing the median of
        ROW_NUMBER() OVER (
            PARTITION BY partitionField
            ORDER BY orderField
        ) AS seqnum
    FROM data AS d
),
middle_values AS (
    SELECT
        -- Lower middle position: seqnum / 2 for even seqnum, seqnum / 2 + 1 for
        -- odd seqnum, which integer division expresses as (seqnum + 1) / 2.
        NTH_VALUE(medianField, CAST((seqnum + 1) / 2 AS INT)) OVER (
            PARTITION BY partitionField
            ORDER BY orderField
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS median1,
        -- Upper middle position; only used when seqnum is even.
        NTH_VALUE(medianField, CAST(seqnum / 2 AS INT) + 1) OVER (
            PARTITION BY partitionField
            ORDER BY orderField
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS median2,
        MOD(seqnum, 2) AS mod1  -- 0 for an even-sized window, 1 for an odd-sized one
    FROM numbered
    ORDER BY partitionField, orderField
)
SELECT
    CASE
        -- Even-sized window: average the two middle values.
        WHEN mod1 = 0 THEN CAST((median1 + median2) / 2 AS FLOAT)
        -- Odd-sized window: the single middle value.
        ELSE median1
    END AS median
FROM middle_values