我必须从vertica数据库中的表中的数据计算加权中位数。我写了一个查询,该查询基于_weight变量(也使用lag函数)来计算_mbr列分区和中位数的运行总和。我正在寻找的最终输出是_mbr,_var,_weighted_median。当权重不相等时,查询会提供正确的结果,但权重相等时会产生多于一行。
在这一点上,我仍然不确定这是否是在vertica中实现加权中值计算的最佳方法,如果是,那么还需要其他什么来纠正它?
create table median_data
(_mbr int, _var int, _weight int); commit;
delete from median_data;commit;
insert into median_data (_mbr,_var,_weight) values (1,10,10);commit;
insert into median_data (_mbr,_var,_weight) values (1,100,25);commit;
insert into median_data (_mbr,_var,_weight) values (1,1000,36);commit;
insert into median_data (_mbr,_var,_weight) values (2,20,30);commit;
insert into median_data (_mbr,_var,_weight) values (3,40,40);commit;
insert into median_data (_mbr,_var,_weight) values (4,40,50);commit;
-------------------------------------------------------------------
with
median_calcs as
(
select * ,case when _lowvalue <> _highvalue then (lag_var+_var)/2
else _var end as median_var
from
(
select distinct _mbr, _var, _weight
,sum(_weight) over(partition by _mbr order by _weight) as _runsum
--,sum(_weight) over(partition by _mbr) as total
--,sum(_weight) over(partition by _mbr order by _weight)/sum(_var)
over(partition by _mbr) as _pct
,floor(sum(_weight) over(partition by _mbr)/2) as _lowvalue
,ceil (sum(_weight) over(partition by _mbr)/2) as _highvalue
,lag(_var) over(partition by _mbr order by _mbr,_weight) as lag_var
from median_data
) runsums
where _lowvalue <= _runsum and _runsum >= _highvalue
)
select * from median_calcs;