Question

我想给数据中的连续段（streak，即连续出现的相同 np 值）编号，目标是找出 np 标记连续出现至少 3 次的位置。

以下是我的数据的一个子集：

-- Rebuild the sample table used by the queries below.
-- The drop, create and insert all target bi_adhoc.test, the table the queries
-- select from, so the script can be re-run without a "table already exists" error.
drop table if exists bi_adhoc.test;
create table bi_adhoc.test (id varchar(12),rd date,np decimal);

-- Explicit column list so the insert does not depend on column order.
insert into bi_adhoc.test (id, rd, np)
select 'aaabbbccc', '2016-07-25'::date, 0 union all
select 'aaabbbccc', '2016-08-01'::date, 0 union all
select 'aaabbbccc', '2016-08-08'::date, 0 union all
select 'aaabbbccc', '2016-08-15'::date, 0 union all
select 'aaabbbccc', '2016-08-22'::date, 1 union all
select 'aaabbbccc', '2016-08-29'::date, 0 union all
select 'aaabbbccc', '2016-09-05'::date, 1 union all
select 'aaabbbccc', '2016-09-12'::date, 0 union all
select 'aaabbbccc', '2016-09-19'::date, 1;

我尝试使用 row_number() 和 count()，但没有得到我想要的结果。

-- Attempted numbering with plain row_number()/count() windows.
--   all_ctr    : position of the row within its id, ordered by rd
--   all_count  : number of rows (non-null id) for the id
--   np_counter : position of the row among rows sharing the same id and np value
--   np_non_np  : number of rows sharing the same id and np value
-- np_counter keeps counting across interruptions because the partition is the
-- np value itself, not an individual run of consecutive rows.
select
    t.*,
    row_number() over (partition by t.id order by t.rd) as all_ctr,
    count(t.id) over (partition by t.id) as all_count,
    row_number() over (partition by t.id, t.np order by t.rd) as np_counter,
    count(t.id) over (partition by t.id, t.np) as np_non_np
from bi_adhoc.test as t
order by t.rd;

以下是我的结果，以及所需的结果：

id          rd              np    all_ctr   all_count   np_counter  np_non_np   **Desired**
aaabbbccc   7/25/2016        0      1          9           1           6           **1**
aaabbbccc   8/1/2016         0      2          9           2           6           **2**
aaabbbccc   8/8/2016         0      3          9           3           6           **3**
aaabbbccc   8/15/2016        0      4          9           4           6           **4**
aaabbbccc   8/22/2016        1      5          9           1           3           **1**
aaabbbccc   8/29/2016        0      6          9           5           6           **1**
aaabbbccc   9/5/2016         1      7          9           2           3           **1**
aaabbbccc   9/12/2016        0      8          9           6           6           **1**
aaabbbccc   9/19/2016        1      9          9           3           3           **1**

Answer 1

一种方法是先在 CTE 中计算 lag(np)（即上一行的 np 值），然后比较当前行的 np 与上一行的 np 来检测连续段（streak）。这可能不是最佳方式，但似乎可行。

-- Number the rows inside each streak (run of consecutive rows with the same np
-- value, per id, in rd order), restarting at 1 every time np changes.
with source_cte as 
(
-- Row position and the previous row's np within the same id.
select
    t.id
   ,t.rd
   ,t.np
   ,row_number() over (partition by t.id order by t.rd) as row_num
   ,lag(t.np,1) over (partition by t.id order by t.rd) as prev_np
from
    bi_adhoc.test t
)
, streak_cte as
(
select
    s.*
    -- 1 when np equals the previous row's np, or when there is no previous np value.
   ,case when s.np=s.prev_np or s.prev_np is NULL then 1 else 0 end as is_streak
    -- Running count of value changes. It stays constant while np repeats and
    -- increases by one at every change, so it identifies the individual streak.
    -- Partitioning by is_streak alone would merge all streaks of an id into one
    -- partition and the numbering would never restart for a later streak.
    -- The frame is spelled out because Redshift requires it for aggregate
    -- window functions that have an order by.
   ,sum(case when s.np=s.prev_np then 0 else 1 end)
        over (partition by s.id order by s.rd
              rows between unbounded preceding and current row) as streak_id
from 
    source_cte s
)
select 
    id
   ,rd
   ,np
   ,row_num
   ,prev_np
   ,is_streak
    -- Position of the row inside its own streak.
   ,row_number() over (partition by id, streak_id order by rd) as desired
from 
    streak_cte 
order by 
    rd;

Answer 2

首先，我添加了一些额外的数据来帮助充分说明问题...

-- Rebuild bi_adhoc.test with a longer sample containing several hit (1) and
-- miss (0) runs of different lengths for a single id.
drop table if exists bi_adhoc.test;
create table bi_adhoc.test
(
    id      varchar(12)
   ,period  date
   ,hit     decimal
);

insert into bi_adhoc.test (id, period, hit)
values
    ('aaabbbccc', '2016-07-25'::date, 0)
   ,('aaabbbccc', '2016-08-01'::date, 0)
   ,('aaabbbccc', '2016-08-08'::date, 0)
   ,('aaabbbccc', '2016-08-15'::date, 1)
   ,('aaabbbccc', '2016-08-22'::date, 1)
   ,('aaabbbccc', '2016-08-29'::date, 0)
   ,('aaabbbccc', '2016-09-05'::date, 0)
   ,('aaabbbccc', '2016-09-12'::date, 1)
   ,('aaabbbccc', '2016-09-19'::date, 0)
   ,('aaabbbccc', '2016-09-26'::date, 1)
   ,('aaabbbccc', '2016-10-03'::date, 1)
   ,('aaabbbccc', '2016-10-10'::date, 1)
   ,('aaabbbccc', '2016-10-17'::date, 1)
   ,('aaabbbccc', '2016-10-24'::date, 1)
   ,('aaabbbccc', '2016-10-31'::date, 0)
   ,('aaabbbccc', '2016-11-07'::date, 0)
   ,('aaabbbccc', '2016-11-14'::date, 0)
   ,('aaabbbccc', '2016-11-21'::date, 0)
   ,('aaabbbccc', '2016-11-28'::date, 0)
   ,('aaabbbccc', '2016-12-05'::date, 1)
   ,('aaabbbccc', '2016-12-12'::date, 1);

接下来的关键是弄清楚什么是连续段（streak），以及如何标识每一个连续段，以便据此对数据进行分区。

-- Number rows inside each run of consecutive hits (hit = 1) or misses (hit = 0).
-- A run is identified by the difference between the row's position within its id
-- and its position among rows with the same id and hit value: that difference
-- stays constant while the hit value repeats on consecutive rows.
with flagged as
(
select
    t.*
    -- Position of the row within its id (left unaliased, as in the result header).
    ,row_number() over (partition by t.id order by t.period)
    -- Running count of hits; null on miss rows.
    ,case
        when t.hit = 1 then row_number() over (partition by t.id, t.hit order by t.period)
    end as hit_counter
    -- Run identifier for hit rows; null on miss rows.
    ,case
        when t.hit = 1 then row_number() over (partition by t.id order by t.period)
                          - row_number() over (partition by t.id, t.hit order by t.period)
    end as hit_partition
    -- Running count of misses; null on hit rows.
    ,case
        when t.hit = 0 then row_number() over (partition by t.id, t.hit order by t.period)
    end as miss_counter
    -- Run identifier for miss rows; null on hit rows.
    ,case
        when t.hit = 0 then row_number() over (partition by t.id order by t.period)
                          - row_number() over (partition by t.id, t.hit order by t.period)
    end as miss_partition
from
    bi_adhoc.test t
)
select
    t1.*
    -- Restart the numbering for every run, using the run identifier that matches
    -- the row's own hit value; any other hit value yields null.
    ,case t1.hit
        when 1 then row_number() over (partition by t1.id, t1.hit_partition order by t1.period)
        when 0 then row_number() over (partition by t1.id, t1.miss_partition order by t1.period)
    end as desired
from
    flagged t1
order by
    t1.id
    ,t1.period;

结果：

id          period          hit     row_number  hit_counter hit_partition   miss_counter    miss_partition  desired
aaabbbccc   2016-07-25      0       1           NULL        NULL            1               0               1
aaabbbccc   2016-08-01      0       2           NULL        NULL            2               0               2
aaabbbccc   2016-08-08      0       3           NULL        NULL            3               0               3
aaabbbccc   2016-08-15      1       4           1           3               NULL            NULL            1
aaabbbccc   2016-08-22      1       5           2           3               NULL            NULL            2
aaabbbccc   2016-08-29      0       6           NULL        NULL            4               2               1
aaabbbccc   2016-09-05      0       7           NULL        NULL            5               2               2
aaabbbccc   2016-09-12      1       8           3           5               NULL            NULL            1
aaabbbccc   2016-09-19      0       9           NULL        NULL            6               3               1
aaabbbccc   2016-09-26      1       10          4           6               NULL            NULL            1
aaabbbccc   2016-10-03      1       11          5           6               NULL            NULL            2
aaabbbccc   2016-10-10      1       12          6           6               NULL            NULL            3
aaabbbccc   2016-10-17      1       13          7           6               NULL            NULL            4
aaabbbccc   2016-10-24      1       14          8           6               NULL            NULL            5
aaabbbccc   2016-10-31      0       15          NULL        NULL            7               8               1
aaabbbccc   2016-11-07      0       16          NULL        NULL            8               8               2
aaabbbccc   2016-11-14      0       17          NULL        NULL            9               8               3
aaabbbccc   2016-11-21      0       18          NULL        NULL            10              8               4
aaabbbccc   2016-11-28      0       19          NULL        NULL            11              8               5
aaabbbccc   2016-12-05      1       20          9           11              NULL            NULL            1
aaabbbccc   2016-12-12      1       21          10          11              NULL            NULL            2

有趣的row_number（） - Redshift Postgres - 时间顺序和重新开始编号

2 个答案: