更新:我之前是在对互不相交的集合取交集,因此更新了原始表格文件。现在与我的大数据集一样,每个批次(lot)在所有时间戳 dt 上都有记录。不过我的版本仍然不起作用……
我正在分析到最近车辆的距离是否会影响被调用(called)的概率。为此我有真实数据,并希望通过把结果随机化 n 次来生成"假"数据。在这些随机生成的假数据集中,按照定义不应存在相关性。然后我把真实数据与假数据进行比较,以假数据作为基准。
我正在寻找在 Postgres 9.3 中实现这一点的建议。目前我正在尝试使用一个函数,它以随机列的数量 n 作为参数。
输入表 small_panel 来自真实观测,包含许多 lotid;其中一列 called 在被调用时为 1,否则为 NULL;另一列 distance 取各种数值。我想生成 n 列(called_1, called_2, ..., called_n),在每一列中随机抽取被调用的 lotid,但每个时间段 dt 的调用次数保持与原始数据相同。
例如,表中时间戳 dt = 2009-06-05 12:40:00 下有 2 次调用,所以我希望在每个伪列中该时间戳下仍然有 2 次调用,只是随机调整由哪些批次获得调用。
我非常感谢您的意见。
这里的示例表很小(15 行),但我实际要处理的表很大(例如 300 万行),所以如果太慢,我之后可能需要进行优化。
原始输入表 small_panel 如下所示(用于建表的 csv 文件和 pgsql 文件,以及我那个尚不能正常工作的函数,都放在了 here):
-- Show the toy input panel, ordered by timestamp.
SELECT sp.*
FROM small_panel AS sp
ORDER BY sp.dt;
gid | id | lotid | called | distance | dt
--------+------+--------+--------+----------+---------------------
536596 | 207 | 2904 | | 392.648 | 2009-06-05 12:40:00
538417 | 2025 | 230328 | 1 | 69.9698 | 2009-06-05 12:40:00
537409 | 1017 | 5453 | 1 | 190.071 | 2009-06-05 12:40:00
629637 | 1017 | 5453 | | 226.278 | 2009-06-05 19:00:00
630647 | 2025 | 230328 | | 387.914 | 2009-06-05 19:00:00
628826 | 207 | 2904 | 1 | 439.769 | 2009-06-05 19:00:00
644185 | 1017 | 5453 | | 124.361 | 2009-06-05 20:00:00
645195 | 2025 | 230328 | | 214.799 | 2009-06-05 20:00:00
643377 | 207 | 2904 | 1 | 8.50651 | 2009-06-05 20:00:00
670887 | 1017 | 5453 | 1 | 81.0408 | 2009-06-05 21:50:00
671896 | 2025 | 230328 | | 193.953 | 2009-06-05 21:50:00
670076 | 207 | 2904 | | 135.042 | 2009-06-05 21:50:00
679781 | 207 | 2904 | | 170.979 | 2009-06-05 22:30:00
680594 | 1017 | 5453 | | 223.304 | 2009-06-05 22:30:00
681605 | 2025 | 230328 | 1 | 92.0443 | 2009-06-05 22:30:00
(15 rows)
我尝试了以下函数(没有内层循环,几乎可以工作,但结果不对):
-- Remove any previous definition of the function. There is no IF EXISTS here,
-- so this statement raises an error when the function does not exist yet.
DROP FUNCTION ib_randomise_calls(int);
DROP TABLE IF EXISTS panel_000;
-- ib_randomise_calls(n): the question's (not yet correct) attempt.
--
-- Parameters:
--   n  number of randomised columns called_1 .. called_n to add (default 3).
-- Returns:
--   the text 'small_panel' (mytext is never reassigned; variable r is unused).
-- Side effects:
--   drops and recreates panel_000, panel_000_times and panel_000_lots.
--
-- What the body does:
--   1. panel_000       = full copy of small_panel plus an INT column calledz.
--   2. panel_000_times = one row per dt, taken from rows whose called is not NULL.
--   3. panel_000_lots  = the distinct lotid values.
--   4. For each i in 1..n: adds column called_<i>, then runs the same UPDATE
--      twice: once as static SQL writing calledz, once through EXECUTE
--      writing called_<i>. Each UPDATE sets the column to 1 for rows whose
--      lotid is in a random sample of panel_000_lots, where the sample size is
--      the number of rows with called = 1 for the row's dt.
--
-- NOTE(review): points that look related to the wrong counts shown below the
-- function; confirm against an EXPLAIN of the UPDATE:
--   * panel_000_lots c appears in FROM without any join condition, so it is
--     cross-joined with the other tables.
--   * the IN (... ORDER BY random() LIMIT ...) subquery refers to a.dt, so it
--     is presumably re-evaluated per candidate row; each row would then be
--     tested against its own random sample instead of one shared sample per dt.
--   * calledz is never reset to NULL between loop iterations, so the 1s written
--     by earlier iterations remain.
CREATE OR REPLACE FUNCTION ib_randomise_calls ( n INT DEFAULT 3 )
RETURNS TEXT AS
$func$
DECLARE mytext TEXT DEFAULT 'small_panel';
DECLARE r record;
BEGIN
DROP TABLE IF EXISTS panel_000;
CREATE TABLE panel_000 AS ( SELECT * FROM small_panel );
ALTER TABLE panel_000 ADD COLUMN calledz INT;
DROP TABLE IF EXISTS panel_000_times;
CREATE TABLE panel_000_times AS ( SELECT distinct on (dt) * FROM panel_000 where called is not null );
DROP TABLE IF EXISTS panel_000_lots;
CREATE TABLE panel_000_lots AS ( SELECT distinct lotID FROM panel_000 order by 1);
FOR i IN 1..n LOOP
RAISE NOTICE 'i: %', i;
EXECUTE format( $x1$ ALTER TABLE panel_000 ADD COLUMN called_%1$s INT; $x1$, i);
UPDATE panel_000 a SET calledZ = 1
from panel_000_times b , panel_000_lots c
where a.dt = b.dt
AND a.lotid IN ( select lotID from panel_000_lots
order by random()
limit (select count(*) from panel_000 d where called=1 and a.dt=d.dt) ) ;
EXECUTE format(
$x2$
UPDATE panel_000 a SET called_%1$s = 1
from panel_000_times b , panel_000_lots c
where a.dt = b.dt
AND a.lotid IN ( select lotID from panel_000_lots
order by random()
limit (select count(*) from panel_000 d where called=1 and a.dt=d.dt) ) ;
$x2$, i);
END LOOP ;
RETURN mytext;
END;
$func$ LANGUAGE plpgsql;
-- Run the attempt with four randomised columns (called_1 .. called_4).
select ib_randomise_calls(4);
它会生成这些列,但结果是错误的(各列在每个时间戳 dt 下的调用次数与原始数据不一致):
-- Per-timestamp check: row count, distinct lots, observed calls, and the sum
-- of every generated column, so the columns can be compared side by side.
SELECT
    p.dt,
    count(*),
    count(DISTINCT p.lotid) AS lots,
    sum(p.called) AS calls,
    sum(p.calledz) AS cz,
    sum(p.called_1) AS c1,
    sum(p.called_2) AS c2,
    sum(p.called_3) AS c3,
    sum(p.called_4) AS c4
FROM panel_000 AS p
GROUP BY p.dt
ORDER BY p.dt;
dt | count | lots | calls | cz | c1 | c2 | c3 | c4
---------------------+-------+------+-------+----+----+----+----+----
2009-06-05 12:40:00 | 3 | 3 | 2 | 3 | 3 | 2 | 1 | 2
2009-06-05 19:00:00 | 3 | 3 | 1 | 3 | | 1 | 1 | 1
2009-06-05 20:00:00 | 3 | 3 | 1 | 2 | | 2 | 1 | 1
2009-06-05 21:50:00 | 3 | 3 | 1 | 3 | 2 | | | 2
2009-06-05 22:30:00 | 3 | 3 | 1 | 1 | 2 | 3 | 1 | 1
(5 rows)
-- Grand totals over the whole panel, labelled 'TOT .' in the dt column.
SELECT
    'TOT .' AS dt,
    count(*),
    count(DISTINCT p.lotid) AS lots,
    sum(p.called) AS calls,
    sum(p.calledz) AS cz,
    sum(p.called_1) AS c1,
    sum(p.called_2) AS c2,
    sum(p.called_3) AS c3,
    sum(p.called_4) AS c4
FROM panel_000 AS p;
dt | count | lots | calls | cz | c1 | c2 | c3 | c4
---------------------+-------+------+-------+----+----+----+----+----
TOT . | 15 | 3 | 6 | 12 | 7 | 8 | 4 | 7
(1 row)
-- Per-lot check: row count, observed calls, and the sum of every generated column.
SELECT
    p.lotid,
    count(*),
    sum(p.called) AS calls,
    sum(p.calledz) AS cz,
    sum(p.called_1) AS c1,
    sum(p.called_2) AS c2,
    sum(p.called_3) AS c3,
    sum(p.called_4) AS c4
FROM panel_000 AS p
GROUP BY p.lotid
ORDER BY p.lotid;
lotid | count | calls | cz | c1 | c2 | c3 | c4
--------+-------+-------+----+----+----+----+----
2904 | 5 | 2 | 5 | 3 | 3 | | 3
5453 | 5 | 2 | 4 | 1 | 3 | 3 | 2
230328 | 5 | 2 | 3 | 3 | 2 | 1 | 2
(3 rows)
答案 0 :(得分:1)
-- Adds three shuffled call columns (c1, c2, c3) to small_panel. Within every
-- timestamp dt each column holds exactly as many 1s as the observed column
-- `called` has non-NULL values; which lots receive them is random.
--
-- The shuffle lives in CTEs rather than in a derived table inside the join:
-- a WITH query is evaluated once per statement, so the random() ordering
-- cannot be recomputed if the planner decides to rescan the join input.
WITH slot AS (
    -- One row per timestamp: the lots seen at that time and how many were called.
    SELECT
        dt,
        array_agg(lotid) AS lot_ids,
        count(called)::int AS n_called      -- count() skips NULLs
    FROM small_panel
    GROUP BY dt
),
pool AS (
    -- Flag array per timestamp: n_called ones followed by NULLs, one entry per lot.
    SELECT
        dt,
        lot_ids,
        array_fill(1, ARRAY[n_called])
            || array_fill(NULL::int, ARRAY[array_length(lot_ids, 1) - n_called]) AS flags
    FROM slot
),
shuffled AS (
    -- Every scalar subquery calls random() on its own, so the three columns are
    -- shuffled independently. All unnest() calls expand arrays of the same
    -- length, so they advance in lockstep and pair each lot with one flag.
    SELECT
        dt,
        unnest(lot_ids) AS lotid,
        unnest((SELECT array_agg(f ORDER BY random()) FROM unnest(flags) AS u(f))) AS c1,
        unnest((SELECT array_agg(f ORDER BY random()) FROM unnest(flags) AS u(f))) AS c2,
        unnest((SELECT array_agg(f ORDER BY random()) FROM unnest(flags) AS u(f))) AS c3
    FROM pool
)
SELECT
    small_panel.*,
    shuffled.c1,
    shuffled.c2,
    shuffled.c3
FROM small_panel
LEFT JOIN shuffled USING (dt, lotid);
输出
gid | id | lotid | called | distance | dt | c1 | c2 | c3
--------+------+--------+--------+----------+---------------------+----+----+----
536596 | 207 | 2904 | | 392.648 | 2009-06-05 12:40:00 | | | 1
537409 | 1017 | 5453 | 1 | 190.071 | 2009-06-05 12:40:00 | 1 | 1 |
538417 | 2025 | 230328 | 1 | 69.9698 | 2009-06-05 12:40:00 | 1 | 1 | 1
628826 | 207 | 2904 | 1 | 439.769 | 2009-06-05 19:00:00 | | | 1
629637 | 1017 | 5453 | | 226.278 | 2009-06-05 19:00:00 | 1 | 1 |
630647 | 2025 | 230328 | | 387.914 | 2009-06-05 19:00:00 | | |
643377 | 207 | 2904 | 1 | 8.50651 | 2009-06-05 20:00:00 | | | 1
644185 | 1017 | 5453 | | 124.361 | 2009-06-05 20:00:00 | 1 | 1 |
645195 | 2025 | 230328 | | 214.799 | 2009-06-05 20:00:00 | | |
670076 | 207 | 2904 | | 135.042 | 2009-06-05 21:50:00 | | | 1
670887 | 1017 | 5453 | 1 | 81.0408 | 2009-06-05 21:50:00 | 1 | 1 |
671896 | 2025 | 230328 | | 193.953 | 2009-06-05 21:50:00 | | |
679781 | 207 | 2904 | | 170.979 | 2009-06-05 22:30:00 | 1 | |
680594 | 1017 | 5453 | | 223.304 | 2009-06-05 22:30:00 | | |
681605 | 2025 | 230328 | 1 | 92.0443 | 2009-06-05 22:30:00 | | 1 | 1
我不明白calledZ
的含义所以我没有包含它。
这是没有循环且没有中间表的动态函数:
-- ib_randomise_calls(_n): rebuilds panel_000 as small_panel plus _n shuffled
-- call columns called_1 .. called_<_n>, without a loop or intermediate tables.
--
-- Parameters:
--   _n  number of shuffled columns to generate (default 3). 0 produces a plain
--       copy of small_panel; NULL or a negative value raises an exception.
-- Returns: nothing.
-- Side effects: drops and recreates table panel_000.
--
-- Within every timestamp dt each generated column holds exactly as many 1s as
-- `called` has non-NULL values; which lots receive them is random.
--
-- The column lists are built with generate_series() rather than
-- unnest(...) WITH ORDINALITY, which only exists from PostgreSQL 9.4 on
-- (the question targets 9.3).
create or replace function ib_randomise_calls (_n int default 3)
returns void as $func$
declare
    _shuffles text;  -- "unnest((...)) as called_1, unnest((...)) as called_2, ..."
    _names    text;  -- "shuffled.called_1, shuffled.called_2, ..."
begin
    if _n is null or _n < 0 then
        raise exception 'ib_randomise_calls: _n must be a non-negative integer, got %', _n;
    end if;

    drop table if exists panel_000;

    -- With no columns to add the generated statement would have an empty
    -- select-list tail, so handle this case directly.
    if _n = 0 then
        create table panel_000 as select * from small_panel;
        return;
    end if;

    select
        string_agg(
            format(
                'unnest((select array_agg(f order by random()) from unnest(flags) as u(f))) as %I',
                'called_' || k
            ),
            ', ' order by k
        ),
        string_agg(format('shuffled.%I', 'called_' || k), ', ' order by k)
    into _shuffles, _names
    from generate_series(1, _n) as g(k);

    -- The shuffle is computed in CTEs so it is evaluated once per statement;
    -- every generated scalar subquery calls random() independently, so the
    -- columns are shuffled independently of each other.
    execute format($$
        create table panel_000 as
        with slot as (
            select
                dt,
                array_agg(lotid) as lot_ids,
                count(called)::int as n_called
            from small_panel
            group by dt
        ),
        pool as (
            select
                dt,
                lot_ids,
                array_fill(1, array[n_called])
                    || array_fill(null::int, array[array_length(lot_ids, 1) - n_called]) as flags
            from slot
        ),
        shuffled as (
            select
                dt,
                unnest(lot_ids) as lotid,
                %s
            from pool
        )
        select
            small_panel.*,
            %s
        from small_panel
        left join shuffled using (dt, lotid)
    $$, _shuffles, _names);
end;
$func$ language plpgsql;
答案 1 :(得分:0)
基于 Clodoaldo Neto 的查询、可以正常工作的动态函数:
-- Drop any earlier definition first: CREATE OR REPLACE cannot change a return
-- type, and IF EXISTS keeps the script runnable on a fresh database.
DROP FUNCTION IF EXISTS ib_randomise_calls(int);
DROP TABLE IF EXISTS panel_000;

-- ib_randomise_calls(n): rebuilds panel_000 as a copy of small_panel and adds
-- n shuffled call columns called_1 .. called_<n>, one loop iteration per column.
--
-- Parameters:
--   n  number of shuffled columns to add (default 3).
-- Returns:
--   the last loop index as text, or 'small_panel' when the loop never runs.
-- Side effects:
--   drops and recreates panel_000; uses a temporary scratch table panel_i
--   that is dropped again before returning.
--
-- Within every timestamp dt each generated column holds exactly as many 1s as
-- `called` has non-NULL values; which lots receive them is random.
CREATE OR REPLACE FUNCTION ib_randomise_calls ( n INT DEFAULT 3 )
RETURNS TEXT AS
$func$
DECLARE
    mytext   TEXT DEFAULT 'small_panel';
    col_name TEXT;
BEGIN
    DROP TABLE IF EXISTS panel_000;
    CREATE TABLE panel_000 AS SELECT * FROM small_panel;

    FOR i IN 1..n LOOP
        RAISE NOTICE 'i: %', i;
        col_name := 'called_' || i;
        EXECUTE format('ALTER TABLE panel_000 ADD COLUMN %I INT', col_name);

        -- One shuffle per iteration, stored in a scratch table so the random
        -- draw is fixed before it is joined back. Only the join keys and the
        -- flag are kept; the rest of the row already lives in panel_000.
        DROP TABLE IF EXISTS panel_i;
        CREATE TEMP TABLE panel_i AS
        SELECT
            pool.dt,
            unnest(pool.lot_ids) AS lotid,
            unnest((SELECT array_agg(f ORDER BY random()) FROM unnest(pool.flags) AS u(f))) AS ci
        FROM (
            -- Flag array per timestamp: n_called ones followed by NULLs, one per lot.
            SELECT
                slot.dt,
                slot.lot_ids,
                array_fill(1, ARRAY[slot.n_called])
                    || array_fill(NULL::int, ARRAY[array_length(slot.lot_ids, 1) - slot.n_called]) AS flags
            FROM (
                SELECT
                    dt,
                    array_agg(lotid) AS lot_ids,
                    count(called)::int AS n_called   -- count() skips NULLs
                FROM small_panel
                GROUP BY dt
            ) AS slot
        ) AS pool;

        -- The column name is dynamic, so the UPDATE has to go through EXECUTE.
        EXECUTE format(
            'UPDATE panel_000 AS a SET %I = b.ci FROM panel_i AS b WHERE a.dt = b.dt AND a.lotid = b.lotid',
            col_name
        );
        mytext := i;
    END LOOP;

    DROP TABLE IF EXISTS panel_i;
    RETURN mytext;
END;
$func$ LANGUAGE plpgsql;
-- Example call: rebuilds panel_000 with five shuffled columns
-- (called_1 .. called_5); the function returns the last loop index as text.
select ib_randomise_calls(5);