在 Postgres 9.3 的表中生成随机化的调用(called)列

时间:2017-04-05 07:18:48

标签: postgresql loops random parameters postgresql-9.3

UPDATE:我之前实际上是在求不相交集合的交集,因此更新了原始的表文件。现在与我的大数据集一样,每个 lot(lotid)在所有时间戳 dt 上都有记录。不过我的版本仍然不起作用......

我正在分析到最近车辆的距离是否会影响被呼叫(called)的概率。为此我有真实数据,并希望通过把结果随机化 n 次来生成"假"数据。按照定义,随机化后的数据集中不应存在相关性。然后我会把真实数据与这些随机数据进行比较,以后者作为基准。

问题:

我正在寻找有关在Postgres 9.3中实现此目的的建议。目前我正在尝试使用一个函数,该函数以要生成的随机列的数量 n 作为参数。

输入表small_panel来自真实的观察,包含很多lotid;其中有一列called,被呼叫时为1,否则为NULL;还有一列distance,取各种不同的值。我想生成n列(called_1, called_2, ... ,called_n),每一列中获得调用的lotid是随机抽取的,但每个时间段dt的调用次数与原始数据相同。

例如,在表中有2个带有时间戳dt = 2009-06-05 22:30:00的调用,所以我希望每个新列中该时间戳下仍然有2个调用,只是随机决定由哪些lot获得这些调用。

我非常感谢您的意见。

要尝试的数据:

这里的示例表很小(15行),但我实际要处理的表很大(例如300万行),所以如果太慢,我之后可能还需要进行优化。

原始输入表small_panel看起来像这样(我已经把用于创建该表的csv和pgsql文件,以及我那个还不能正常工作的函数放在了here):

-- Show the toy input table in timestamp order.
SELECT *
FROM small_panel
ORDER BY dt;

  gid   |  id  | lotid  | called | distance |         dt          
--------+------+--------+--------+----------+---------------------
 536596 |  207 |   2904 |        |  392.648 | 2009-06-05 12:40:00
 538417 | 2025 | 230328 |      1 |  69.9698 | 2009-06-05 12:40:00
 537409 | 1017 |   5453 |      1 |  190.071 | 2009-06-05 12:40:00
 629637 | 1017 |   5453 |        |  226.278 | 2009-06-05 19:00:00
 630647 | 2025 | 230328 |        |  387.914 | 2009-06-05 19:00:00
 628826 |  207 |   2904 |      1 |  439.769 | 2009-06-05 19:00:00
 644185 | 1017 |   5453 |        |  124.361 | 2009-06-05 20:00:00
 645195 | 2025 | 230328 |        |  214.799 | 2009-06-05 20:00:00
 643377 |  207 |   2904 |      1 |  8.50651 | 2009-06-05 20:00:00
 670887 | 1017 |   5453 |      1 |  81.0408 | 2009-06-05 21:50:00
 671896 | 2025 | 230328 |        |  193.953 | 2009-06-05 21:50:00
 670076 |  207 |   2904 |        |  135.042 | 2009-06-05 21:50:00
 679781 |  207 |   2904 |        |  170.979 | 2009-06-05 22:30:00
 680594 | 1017 |   5453 |        |  223.304 | 2009-06-05 22:30:00
 681605 | 2025 | 230328 |      1 |  92.0443 | 2009-06-05 22:30:00
(15 rows)

解决方案 - 尝试:

我尝试了以下函数(没有内层循环,几乎可以运行,但结果不对):

-- Remove any previous definition and output table before re-creating them.
-- NOTE: there is no IF EXISTS on DROP FUNCTION, so this statement raises an
-- error when the function has not been created yet.
DROP FUNCTION ib_randomise_calls(int);
DROP TABLE IF EXISTS panel_000;

-- ib_randomise_calls(n): first attempt at adding n randomised call columns.
--   Copies small_panel into panel_000, adds an INT column calledz plus INT
--   columns called_1 .. called_n, and sets some of their rows to 1 based on
--   a random selection of lot ids.
--   Parameters: n - number of called_<i> columns to add (default 3).
--   Returns: the text 'small_panel' (mytext is never reassigned).
--   Side effects: drops and recreates panel_000, panel_000_times and
--   panel_000_lots.
--   Known problem (this is the subject of the question): the number of 1s
--   per dt in the generated columns does not match the real call counts.
CREATE OR REPLACE FUNCTION ib_randomise_calls ( n INT DEFAULT 3 ) 
RETURNS TEXT AS
$func$
  DECLARE mytext TEXT DEFAULT 'small_panel';
  -- r is declared but never used in the body.
  DECLARE r record;
BEGIN

  -- Working copy of the input with one extra scratch column.
  DROP TABLE IF EXISTS panel_000;
  CREATE TABLE panel_000 AS ( SELECT * FROM small_panel  );
  ALTER TABLE panel_000 ADD COLUMN calledz INT;

  -- One (arbitrary) row per dt that has at least one non-NULL called value.
  DROP TABLE IF EXISTS panel_000_times;
  CREATE TABLE panel_000_times AS ( SELECT distinct on (dt) * FROM panel_000 where called is not null );

  -- The distinct lot ids.
  DROP TABLE IF EXISTS panel_000_lots;
  CREATE TABLE panel_000_lots AS ( SELECT distinct lotID FROM panel_000 order by 1);    

  FOR i IN 1..n LOOP

      RAISE NOTICE 'i: %', i;
      -- The column name depends on i, so the DDL has to be built dynamically.
      EXECUTE format( $x1$ ALTER TABLE panel_000 ADD COLUMN called_%1$s INT; $x1$, i);

      -- Static variant of the update, writing to calledz.
      -- panel_000_lots c is listed in FROM but is neither joined nor
      -- referenced. The IN sub-select is correlated with the outer row
      -- through a.dt in its LIMIT.
      -- NOTE(review): presumably the sub-select is therefore re-evaluated,
      -- with a fresh random() ordering, for each row rather than once per dt,
      -- which would explain why the per-dt totals differ from the real call
      -- counts - confirm with EXPLAIN.
      -- calledz is never reset inside the loop, so 1s written by earlier
      -- iterations remain in place.
      UPDATE panel_000 a SET calledZ = 1  
          from panel_000_times b , panel_000_lots c
          where a.dt = b.dt
          AND a.lotid IN (  select lotID from panel_000_lots  
                            order by random() 
                            limit (select count(*) from panel_000 d where called=1 and a.dt=d.dt)   ) ;

      -- Same update again, but targeting the freshly added called_<i> column.
      EXECUTE format( 
      $x2$ 
        UPDATE panel_000 a SET called_%1$s = 1  
            from panel_000_times b , panel_000_lots c
            where a.dt = b.dt
            AND a.lotid IN (  select lotID from panel_000_lots  
                              order by random() 
                              limit (select count(*) from panel_000 d where called=1 and a.dt=d.dt)   ) ;
      $x2$, i);    

  END LOOP ;
  RETURN mytext;
END;
$func$ LANGUAGE plpgsql;


-- Run the attempt with four generated columns (called_1 .. called_4).
select ib_randomise_calls(4);

它会生成这些列,但结果是错误的(各列中每个时间戳dt的调用次数与原始数据不一致):

-- Per-timestamp check: real call count next to the count in each generated column.
SELECT
    dt,
    count(*),
    count(DISTINCT lotid) AS lots,
    sum(called) AS calls,
    sum(calledz) AS cz,
    sum(called_1) AS c1,
    sum(called_2) AS c2,
    sum(called_3) AS c3,
    sum(called_4) AS c4
FROM panel_000
GROUP BY dt
ORDER BY dt;

         dt          | count | lots | calls | cz | c1 | c2 | c3 | c4 
---------------------+-------+------+-------+----+----+----+----+----
 2009-06-05 12:40:00 |     3 |    3 |     2 |  3 |  3 |  2 |  1 |  2
 2009-06-05 19:00:00 |     3 |    3 |     1 |  3 |    |  1 |  1 |  1
 2009-06-05 20:00:00 |     3 |    3 |     1 |  2 |    |  2 |  1 |  1
 2009-06-05 21:50:00 |     3 |    3 |     1 |  3 |  2 |    |    |  2
 2009-06-05 22:30:00 |     3 |    3 |     1 |  1 |  2 |  3 |  1 |  1
(5 rows)

 -- Grand totals over the whole table, labelled so the row lines up with the per-dt output.
 SELECT
     'TOT               .' AS dt,
     count(*),
     count(DISTINCT lotid) AS lots,
     sum(called) AS calls,
     sum(calledz) AS cz,
     sum(called_1) AS c1,
     sum(called_2) AS c2,
     sum(called_3) AS c3,
     sum(called_4) AS c4
 FROM panel_000;
         dt          | count | lots | calls | cz | c1 | c2 | c3 | c4 
---------------------+-------+------+-------+----+----+----+----+----
 TOT               . |    15 |    3 |     6 | 12 |  7 |  8 |  4 |  7
(1 row)

-- Per-lot check: real call count next to the count in each generated column.
SELECT
    lotid,
    count(*),
    sum(called) AS calls,
    sum(calledz) AS cz,
    sum(called_1) AS c1,
    sum(called_2) AS c2,
    sum(called_3) AS c3,
    sum(called_4) AS c4
FROM panel_000
GROUP BY lotid
ORDER BY lotid;

 lotid  | count | calls | cz | c1 | c2 | c3 | c4 
--------+-------+-------+----+----+----+----+----
   2904 |     5 |     2 |  5 |  3 |  3 |    |  3
   5453 |     5 |     2 |  4 |  1 |  3 |  3 |  2
 230328 |     5 |     2 |  3 |  3 |  2 |  1 |  2
(3 rows)

2 个答案:

答案 0 :(得分:1)

-- Adds three shuffled call columns (c1, c2, c3) to small_panel.
-- For every dt the inner query collects the lot ids into an array and builds
-- a flag array of the same length holding total_called 1s followed by NULLs.
-- Each output column unnests an independently random-ordered copy of that
-- flag array next to the lot ids, so every column keeps the per-dt number of
-- calls while the lots receiving them vary.
SELECT
    small_panel.*,
    c1,
    c2,
    c3
FROM small_panel
LEFT JOIN (
    SELECT
        per_dt.dt,
        unnest(per_dt.lot_ids) AS lotid,
        unnest((
            SELECT array_agg(f.flag ORDER BY random())
            FROM unnest(flags.flag_arr) AS f(flag)
        )) AS c1,
        unnest((
            SELECT array_agg(f.flag ORDER BY random())
            FROM unnest(flags.flag_arr) AS f(flag)
        )) AS c2,
        unnest((
            SELECT array_agg(f.flag ORDER BY random())
            FROM unnest(flags.flag_arr) AS f(flag)
        )) AS c3
    FROM (
        -- One row per dt: its lot ids and the number of non-NULL called values.
        SELECT
            dt,
            array_agg(lotid) AS lot_ids,
            count(called)::int AS total_called
        FROM small_panel
        GROUP BY dt
    ) AS per_dt
    CROSS JOIN LATERAL (
        -- total_called ones, padded with NULLs up to the number of lots.
        SELECT
            array_fill(1, ARRAY[per_dt.total_called])
            || array_fill(NULL::int, ARRAY[array_length(per_dt.lot_ids, 1) - per_dt.total_called])
    ) AS flags(flag_arr)
) AS shuffled USING (dt, lotid)

输出

  gid   |  id  | lotid  | called | distance |         dt          | c1 | c2 | c3 
--------+------+--------+--------+----------+---------------------+----+----+----
 536596 |  207 |   2904 |        |  392.648 | 2009-06-05 12:40:00 |    |    |  1
 537409 | 1017 |   5453 |      1 |  190.071 | 2009-06-05 12:40:00 |  1 |  1 |   
 538417 | 2025 | 230328 |      1 |  69.9698 | 2009-06-05 12:40:00 |  1 |  1 |  1
 628826 |  207 |   2904 |      1 |  439.769 | 2009-06-05 19:00:00 |    |    |  1
 629637 | 1017 |   5453 |        |  226.278 | 2009-06-05 19:00:00 |  1 |  1 |   
 630647 | 2025 | 230328 |        |  387.914 | 2009-06-05 19:00:00 |    |    |   
 643377 |  207 |   2904 |      1 |  8.50651 | 2009-06-05 20:00:00 |    |    |  1
 644185 | 1017 |   5453 |        |  124.361 | 2009-06-05 20:00:00 |  1 |  1 |   
 645195 | 2025 | 230328 |        |  214.799 | 2009-06-05 20:00:00 |    |    |   
 670076 |  207 |   2904 |        |  135.042 | 2009-06-05 21:50:00 |    |    |  1
 670887 | 1017 |   5453 |      1 |  81.0408 | 2009-06-05 21:50:00 |  1 |  1 |   
 671896 | 2025 | 230328 |        |  193.953 | 2009-06-05 21:50:00 |    |    |   
 679781 |  207 |   2904 |        |  170.979 | 2009-06-05 22:30:00 |  1 |    |   
 680594 | 1017 |   5453 |        |  223.304 | 2009-06-05 22:30:00 |    |    |   
 681605 | 2025 | 230328 |      1 |  92.0443 | 2009-06-05 22:30:00 |    |  1 |  1

我不明白calledZ的含义所以我没有包含它。

这是没有循环且没有中间表的动态函数:

-- ib_randomise_calls(_n)
--   (Re)creates table panel_000 as small_panel plus _n integer columns
--   called_1 .. called_<_n>, using one dynamically built statement (no loop,
--   no intermediate tables). Within each dt, every generated column holds as
--   many 1s as there are non-NULL values of called for that dt; which lots
--   receive them is decided by an independent random ordering per column.
--   All other rows are NULL.
--
--   The column lists are generated with generate_series() instead of
--   unnest(...) WITH ORDINALITY: WITH ORDINALITY only exists from
--   PostgreSQL 9.4 onwards, and the target here is 9.3.
--
--   Parameters: _n - number of shuffled columns to generate (default 3);
--                    must be at least 1.
--   Returns:    nothing.
--   Raises:     an exception when _n is NULL or less than 1 (the generated
--               statement would otherwise have an empty column list).
create or replace function ib_randomise_calls (_n int default 3)
returns void as $func$
declare
    _col_names text;  -- "called_1, called_2, ..." for the outer select list
    _col_exprs text;  -- one shuffled unnest(...) expression per column
begin
    if _n is null or _n < 1 then
        raise exception 'ib_randomise_calls: _n must be >= 1, got %', _n;
    end if;

    select
        string_agg(format('called_%s', i), ', ' order by i),
        string_agg(
            format('unnest((
                    select array_agg(a order by random())
                    from unnest(a) a(a)
                )) as called_%s', i),
            ', ' order by i
        )
    into _col_names, _col_exprs
    from generate_series(1, _n) as g(i);

    drop table if exists panel_000;
    execute format($$
        create table panel_000 as
        select
            small_panel.*, %1$s
        from
            small_panel
            left join (
                select
                    dt,
                    unnest(a_lotid) as lotid, %2$s
                from
                    (
                        -- one row per dt: its lots and its number of calls
                        select
                            dt,
                            array_agg(lotid) a_lotid,
                            count(called)::int as total_called
                        from small_panel
                        group by dt
                    ) s
                    cross join lateral (
                        -- total_called ones padded with NULLs to the number of lots
                        select
                            array_fill(1, array[total_called]) ||
                            array_fill(null::int, array[array_length(a_lotid, 1) - total_called])
                    ) a(a)
            ) s using (dt, lotid)
        $$,
        _col_names,
        _col_exprs
    );
end;
$func$ language plpgsql;

答案 1 :(得分:0)

解决方案

基于Clodoaldo Neto的查询、可以正常工作的动态函数:

-- Drop any previous definition first: CREATE OR REPLACE cannot change the
-- return type of an existing function, so an older variant with the same
-- argument list has to be removed. IF EXISTS keeps the script runnable on a
-- database where the function has never been created.
DROP FUNCTION IF EXISTS ib_randomise_calls(int);
DROP TABLE IF EXISTS panel_000;

-- ib_randomise_calls(n)
--   Rebuilds panel_000 as a copy of small_panel and appends n integer columns
--   called_1 .. called_<n>. For each column a fresh shuffle is computed into
--   the helper table panel_i: per dt, as many 1s as there are non-NULL
--   called values, assigned to lots in random order; the rest NULL. The
--   shuffle is then copied into panel_000 by matching on (dt, lotid).
--   Parameters:   n - number of shuffled columns to add (default 3).
--   Returns:      the last loop index as text, or 'small_panel' when the
--                 loop body never runs.
--   Side effects: drops and recreates panel_000 and panel_i; panel_i is not
--                 dropped at the end and keeps the last shuffle.
CREATE OR REPLACE FUNCTION ib_randomise_calls ( n INT DEFAULT 3 )
  RETURNS TEXT AS
$func$
DECLARE mytext TEXT DEFAULT 'small_panel';
BEGIN
  DROP TABLE IF EXISTS panel_000;
  CREATE TABLE panel_000 AS ( SELECT * FROM small_panel  );
  FOR i IN 1..n LOOP
    RAISE NOTICE 'i: %', i;
    -- The column name depends on i, so the DDL has to be built dynamically.
    EXECUTE format( $x1$ ALTER TABLE panel_000 ADD COLUMN called_%1$s INT; $x1$, i);
    DROP TABLE IF EXISTS panel_i;
    CREATE TABLE panel_i AS (
      select
        small_panel.*, Ci
      from
        small_panel
        left join (
                    select
                      dt,
                      unnest(a_lotid) as lotid,
                      -- random ordering of the flag array, unnested next to the lot ids
                      unnest((select array_agg(a order by random()) from unnest(a) a(a))) as Ci
                    from
                      (
                        -- one row per dt: its lots and its number of calls
                        select
                          dt,
                          array_agg(lotid) a_lotid,
                          count(called)::int as total_called
                        from small_panel
                        group by dt
                      ) s
                      cross join lateral (
                                 -- total_called ones padded with NULLs to the number of lots
                                 select
                                   array_fill(1, array[total_called]) ||
                                   array_fill(null::int, array[array_length(a_lotid, 1) - total_called])
                                 ) a(a)
                  ) s
        using (dt, lotid)
    );
    EXECUTE format(
        $x2$
          UPDATE panel_000 a SET called_%1$s = Ci from panel_i b where a.dt=b.dt and a.lotid = b.lotid;
        $x2$, i);
    mytext:=i;
  END LOOP ;
  RETURN mytext;
END;
$func$ LANGUAGE plpgsql;

-- Build panel_000 with five shuffled columns (called_1 .. called_5).
SELECT ib_randomise_calls(5);