具有缺失值的BIGQUERY移动平均线

时间:2018-04-17 16:36:53

标签: sql google-bigquery moving-average

我有以下数据

-- Sample input: one row per (id, ref_month) carrying an integer value.
-- ref_month is a STRING in 'YYYY-MM-DD' form. Tuples that are commented out
-- mark months that are deliberately absent from the series for that id.
with dummy_data as 
(
select ref_month, value, id
from unnest(array<struct<ref_month string, value int64, id int64>>[
  -- id 1
  ('2017-01-01', 18, 1),
  ('2017-02-01', 20, 1),
  ('2017-03-01', 22, 1),
  -- ('2017-04-01', 28, 1),   gap: no row for this month
  ('2017-05-01', 30, 1),
  ('2017-06-01', 37, 1),
  ('2017-07-01', 42, 1),
  -- ('2017-08-01', 55, 1),   gap: no row for this month
  -- ('2017-09-01', 49, 1),   gap: no row for this month
  ('2017-10-01', 51, 1),
  ('2017-11-01', 57, 1),
  ('2017-12-01', 56, 1),
  -- id 2
  ('2017-01-01', 18, 2),
  ('2017-02-01', 20, 2),
  ('2017-03-01', 22, 2),
  ('2017-04-01', 28, 2),
  -- ('2017-05-01', 30, 2),   gap: no row for this month
  -- ('2017-06-01', 37, 2),   gap: no row for this month
  ('2017-07-01', 42, 2),
  ('2017-08-01', 55, 2),
  ('2017-09-01', 49, 2),
  -- ('2017-10-01', 51, 2),   gap: no row for this month
  ('2017-11-01', 57, 2),
  ('2017-12-01', 56, 2)
])
)

我想计算每个id的移动平均值。我知道你可以做类似以下的事情

-- Naive moving average: for each id, averages value over the current row and
-- up to five preceding rows in ref_month order. The frame counts rows, not
-- calendar months, so a month with no row is skipped rather than treated as
-- a gap.
select 
    id
  , ref_month
  , avg(value) over last_six_rows as moving_avg
from 
    dummy_data
window last_six_rows as (
    partition by id
    order by ref_month
    rows between 5 preceding and current row
)

但是从我的虚拟数据中可以看出,有一些缺失的值。  当有一些缺失值时,有关如何轻松计算移动平均线的任何想法? 我想先计算一个完整的日期范围

-- One row per month between the earliest and latest ref_month in dummy_data
-- (inclusive). ref_month is parsed from 'YYYY-MM-DD' strings into DATEs.
date_range AS
(
  SELECT reference_month
  FROM (
    -- Both bounds of the series, computed in a single pass over dummy_data.
    SELECT
      PARSE_DATE('%Y-%m-%d', MIN(ref_month)) AS first_month,
      PARSE_DATE('%Y-%m-%d', MAX(ref_month)) AS last_month
    FROM dummy_data
  ) AS bounds
  CROSS JOIN UNNEST(
      GENERATE_DATE_ARRAY(bounds.first_month, bounds.last_month, INTERVAL 1 MONTH)
  ) AS reference_month
)

然后与所有 id 做笛卡尔积,再与我的虚拟数据做连接(join),但这似乎是一种反模式。关于如何以最佳方式实现这一点,有什么想法吗? 谢谢

编辑:

预期结果: 对于 id 1:

2017-01-01  18
2017-02-01  19
2017-03-01  20
2017-05-01  18
2017-06-01  21.8
2017-07-01  26.2
2017-10-01  26
2017-11-01  30
2017-12-01  32.8

对于 id 2:

2017-01-01  18
2017-02-01  19
2017-03-01  20
2017-04-01  22
2017-07-01  18.4
2017-08-01  25
2017-09-01  29.2
2017-11-01  40.6
2017-12-01  43.4

3 个答案:

答案 0 :(得分:1)

下面是BigQuery Standard SQL,实际上有效! :O)
它假定您的ref_month属于DATE数据类型(如果在您的情况下,您将其设为STRING - 仍然可以 - 请参阅我答案最底部的注释)

   
#standardSQL
-- Moving average per id over a six-month RANGE window (current month plus the
-- five months before it), keyed on a month index so calendar gaps are honoured.
-- NOTE(review): the divisor is the span between the first and last rows that
-- actually exist in the frame, so a gap at the very start of the window
-- shrinks the divisor. The result is rounded to a whole number.
WITH positioned AS (
  -- month_pos: number of months between ref_month and the fixed anchor date.
  SELECT
    id,
    ref_month,
    value,
    DATE_DIFF(ref_month, '2016-01-01', MONTH) AS month_pos
  FROM dummy_data
),
windowed AS (
  SELECT
    id,
    ref_month,
    SUM(value) OVER rolling_six_months AS window_total,
    FIRST_VALUE(month_pos) OVER rolling_six_months AS earliest_pos,
    LAST_VALUE(month_pos) OVER rolling_six_months AS latest_pos
  FROM positioned
  WINDOW rolling_six_months AS (
    PARTITION BY id
    ORDER BY month_pos
    RANGE BETWEEN 5 PRECEDING AND CURRENT ROW
  )
)
SELECT
  id,
  ref_month,
  ROUND(window_total / (latest_pos - earliest_pos + 1)) AS correct_moving_avg
FROM windowed

您可以使用以下示例数据进行测试/试验

#standardSQL
-- Self-contained demo: inline sample data followed by the six-month moving
-- average query. Commented-out tuples mark months deliberately left out.
WITH dummy_data AS (
  SELECT ref_month, value, id
  FROM UNNEST(ARRAY<STRUCT<ref_month DATE, value INT64, id INT64>>[
    -- id 1
    (DATE '2017-01-01', 18, 1),
    (DATE '2017-02-01', 20, 1),
    (DATE '2017-03-01', 22, 1),
    -- (DATE '2017-04-01', 28, 1),   gap: no row for this month
    (DATE '2017-05-01', 30, 1),
    (DATE '2017-06-01', 37, 1),
    (DATE '2017-07-01', 42, 1),
    -- (DATE '2017-08-01', 55, 1),   gap: no row for this month
    -- (DATE '2017-09-01', 49, 1),   gap: no row for this month
    (DATE '2017-10-01', 51, 1),
    (DATE '2017-11-01', 57, 1),
    (DATE '2017-12-01', 56, 1),
    -- id 2
    (DATE '2017-01-01', 18, 2),
    (DATE '2017-02-01', 20, 2),
    (DATE '2017-03-01', 22, 2),
    (DATE '2017-04-01', 28, 2),
    -- (DATE '2017-05-01', 30, 2),   gap: no row for this month
    -- (DATE '2017-06-01', 37, 2),   gap: no row for this month
    (DATE '2017-07-01', 42, 2),
    (DATE '2017-08-01', 55, 2),
    (DATE '2017-09-01', 49, 2),
    -- (DATE '2017-10-01', 51, 2),   gap: no row for this month
    (DATE '2017-11-01', 57, 2),
    (DATE '2017-12-01', 56, 2)
  ])
),
positioned AS (
  -- month_pos: number of months between ref_month and the fixed anchor date.
  SELECT
    id,
    ref_month,
    value,
    DATE_DIFF(ref_month, '2016-01-01', MONTH) AS month_pos
  FROM dummy_data
),
windowed AS (
  -- Per-row totals and the first/last month index present in the frame
  -- (current month plus the five months before it).
  SELECT
    id,
    ref_month,
    SUM(value) OVER rolling_six_months AS window_total,
    FIRST_VALUE(month_pos) OVER rolling_six_months AS earliest_pos,
    LAST_VALUE(month_pos) OVER rolling_six_months AS latest_pos
  FROM positioned
  WINDOW rolling_six_months AS (
    PARTITION BY id
    ORDER BY month_pos
    RANGE BETWEEN 5 PRECEDING AND CURRENT ROW
  )
)
SELECT
  id,
  ref_month,
  -- Total divided by the month span between the first and last rows found in
  -- the frame, rounded to a whole number.
  ROUND(window_total / (latest_pos - earliest_pos + 1)) AS correct_moving_avg
FROM windowed
ORDER BY id, ref_month

为了帮助您理解其中的逻辑,下面给出上述查询的"展开"版本——它把所有中间值都一直传递到最外层的 SELECT,因此您可以看到每一步的结果……

#standardSQL
-- Expanded version of the six-month moving average query: every intermediate
-- value (window sum, first/last month index in the frame, plain AVG) is
-- exposed in the final result so the calculation can be followed step by step.
WITH dummy_data AS (
  SELECT ref_month, value, id
  FROM UNNEST(ARRAY<STRUCT<ref_month DATE, value INT64, id INT64>>[
    -- id 1
    (DATE '2017-01-01', 18, 1),
    (DATE '2017-02-01', 20, 1),
    (DATE '2017-03-01', 22, 1),
    -- (DATE '2017-04-01', 28, 1),   gap: no row for this month
    (DATE '2017-05-01', 30, 1),
    (DATE '2017-06-01', 37, 1),
    (DATE '2017-07-01', 42, 1),
    -- (DATE '2017-08-01', 55, 1),   gap: no row for this month
    -- (DATE '2017-09-01', 49, 1),   gap: no row for this month
    (DATE '2017-10-01', 51, 1),
    (DATE '2017-11-01', 57, 1),
    (DATE '2017-12-01', 56, 1),
    -- id 2
    (DATE '2017-01-01', 18, 2),
    (DATE '2017-02-01', 20, 2),
    (DATE '2017-03-01', 22, 2),
    (DATE '2017-04-01', 28, 2),
    -- (DATE '2017-05-01', 30, 2),   gap: no row for this month
    -- (DATE '2017-06-01', 37, 2),   gap: no row for this month
    (DATE '2017-07-01', 42, 2),
    (DATE '2017-08-01', 55, 2),
    (DATE '2017-09-01', 49, 2),
    -- (DATE '2017-10-01', 51, 2),   gap: no row for this month
    (DATE '2017-11-01', 57, 2),
    (DATE '2017-12-01', 56, 2)
  ])
),
positioned AS (
  -- month_pos: number of months between ref_month and the fixed anchor date.
  SELECT
    id,
    ref_month,
    value,
    DATE_DIFF(ref_month, '2016-01-01', MONTH) AS month_pos
  FROM dummy_data
),
windowed AS (
  SELECT
    id,
    ref_month,
    value,
    SUM(value) OVER rolling_six_months AS moving_sum,
    FIRST_VALUE(month_pos) OVER rolling_six_months AS first_month,
    LAST_VALUE(month_pos) OVER rolling_six_months AS last_month,
    -- Plain AVG divides by the number of rows present, ignoring gaps;
    -- kept for comparison with correct_moving_avg.
    AVG(value) OVER rolling_six_months AS moving_avg
  FROM positioned
  WINDOW rolling_six_months AS (
    PARTITION BY id
    ORDER BY month_pos
    RANGE BETWEEN 5 PRECEDING AND CURRENT ROW
  )
)
SELECT
  id,
  ref_month,
  value,
  moving_sum,
  first_month,
  last_month,
  -- Sum divided by the month span between the first and last rows found in
  -- the frame, rounded to a whole number.
  ROUND(moving_sum / (last_month - first_month + 1)) AS correct_moving_avg,
  moving_avg
FROM windowed
ORDER BY id, ref_month

结果为

id  ref_month   value moving_sum    first_month last_month  correct_moving_avg  moving_avg   
1    2017-01-01 18    18            12          12          18.0                  18.0   
1    2017-02-01 20    38            12          13          19.0                  19.0   
1    2017-03-01 22    60            12          14          20.0                  20.0   
1    2017-05-01 30    90            12          16          18.0                  22.5   
1    2017-06-01 37    127           12          17          21.0                  25.4   
1    2017-07-01 42    151           13          18          25.0                  30.2   
1    2017-10-01 51    160           16          21          27.0                  40.0   
1    2017-11-01 57    187           17          22          31.0                  46.75  
1    2017-12-01 56    206           18          23          34.0                  51.5   
2    2017-01-01 18    18            12          12          18.0                  18.0   
2    2017-02-01 20    38            12          13          19.0                  19.0   
2    2017-03-01 22    60            12          14          20.0                  20.0   
2    2017-04-01 28    88            12          15          22.0                  22.0   
2    2017-07-01 42    112           13          18          19.0                  28.0   
2    2017-08-01 55    147           14          19          25.0                  36.75  
2    2017-09-01 49    174           15          20          29.0                  43.5   
2    2017-11-01 57    203           18          22          41.0                  50.75  
2    2017-12-01 56    259           18          23          43.0                  51.8     

希望这能展示/解释这种方法的思路

注意:如果您的 ref_month 字段是 STRING 数据类型,则应略微调整包含 DATE_DIFF 的那一行 - 应改为

-- Variant of the month_pos expression for a STRING ref_month: cast it to DATE first.
DATE_DIFF(CAST(ref_month AS DATE), '2016-01-01', MONTH) AS month_pos

注意2:我选择了 '2016-01-01' 作为计算月数的起点 - 但您可以选择任何日期,只要确保它早于您数据中的最小日期即可 - 例如 '2000-01-01' 也同样适用

答案 1 :(得分:0)

这应该有效:

-- Sample input (ref_month is a STRING in 'YYYY-MM-DD' form). Commented-out
-- rows mark months that are deliberately missing for that id.
with dummy_data as 
(
SELECT '2017-01-01' as ref_month, 18 as value, 1 as id
UNION ALL SELECT '2017-02-01' as ref_month, 20 as value, 1 as id
UNION ALL SELECT '2017-03-01' as ref_month, 22 as value, 1 as id
-- UNION ALL SELECT '2017-04-01' as ref_month, 28 as value, 1 as id
UNION ALL SELECT '2017-05-01' as ref_month, 30 as value, 1 as id
UNION ALL SELECT '2017-06-01' as ref_month, 37 as value, 1 as id
UNION ALL SELECT '2017-07-01' as ref_month, 42 as value, 1 as id
-- UNION ALL SELECT '2017-08-01' as ref_month, 55 as value, 1 as id
-- UNION ALL SELECT '2017-09-01' as ref_month, 49 as value, 1 as id
UNION ALL SELECT '2017-10-01' as ref_month, 51 as value, 1 as id
UNION ALL SELECT '2017-11-01' as ref_month, 57 as value, 1 as id
UNION ALL SELECT '2017-12-01' as ref_month, 56 as value, 1 as id
UNION ALL SELECT '2017-01-01' as ref_month, 18 as value, 2 as id
UNION ALL SELECT '2017-02-01' as ref_month, 20 as value, 2 as id
UNION ALL SELECT '2017-03-01' as ref_month, 22 as value, 2 as id
UNION ALL SELECT '2017-04-01' as ref_month, 28 as value, 2 as id
-- UNION ALL SELECT '2017-05-01' as ref_month, 30 as value, 2 as id
-- UNION ALL SELECT '2017-06-01' as ref_month, 37 as value, 2 as id
UNION ALL SELECT '2017-07-01' as ref_month, 42 as value, 2 as id
UNION ALL SELECT '2017-08-01' as ref_month, 55 as value, 2 as id
UNION ALL SELECT '2017-09-01' as ref_month, 49 as value, 2 as id
-- UNION ALL SELECT '2017-10-01' as ref_month, 51 as value, 2 as id
UNION ALL SELECT '2017-11-01' as ref_month, 57 as value, 2 as id
UNION ALL SELECT '2017-12-01' as ref_month, 56 as value, 2 as id
)


-- Five-month moving average per id in which months without a row count as 0.
-- The previous form, avg(avg(value)) over (partition by id order by ref_month),
-- used the default unbounded frame and therefore returned a cumulative average
-- that ignored calendar gaps.
select 
    id
  , ref_month
    -- Sum over the current month and the four months before it, divided by the
    -- number of calendar months covered so far for this id (capped at 5).
  , sum(value) over five_month_window
      / least(5, month_pos - min(month_pos) over (partition by id) + 1) as moving_avg
from 
(
    -- One row per (id, ref_month); duplicate rows are averaged, as before.
    -- month_pos is a running month index so the window can use RANGE and
    -- respect missing months; the anchor date itself is arbitrary.
    select
        id
      , ref_month
      , avg(value) as value
      , date_diff(cast(ref_month as date), date '1970-01-01', month) as month_pos
    from
        dummy_data
    group by id
      , ref_month
)
window five_month_window as
    (partition by id order by month_pos range between 4 preceding and current row)

答案 2 :(得分:0)

如果您希望将缺失值视为 0,并且窗口大小为 5 个月,那么使用一系列 lag() 可能是最简单的方法:

-- Five-month moving average per id in which months without a row count as 0.
-- Assumes ref_month is a DATE (it is passed directly to date_add / date_diff).
-- Each of the four preceding rows contributes only when its month falls within
-- the four months before the current row's month (inclusive); otherwise, or
-- when there is no such row, it adds 0.
select id, ref_month,
       (value +
        (case when lag(ref_month, 1) over id_by_month >= date_add(ref_month, interval -4 month)
              then lag(value, 1) over id_by_month
              else 0
         end) +
        (case when lag(ref_month, 2) over id_by_month >= date_add(ref_month, interval -4 month)
              then lag(value, 2) over id_by_month
              else 0
         end) +
        (case when lag(ref_month, 3) over id_by_month >= date_add(ref_month, interval -4 month)
              then lag(value, 3) over id_by_month
              else 0
         end) +
        (case when lag(ref_month, 4) over id_by_month >= date_add(ref_month, interval -4 month)
              then lag(value, 4) over id_by_month
              else 0
         end)
       ) / 
       -- Months covered so far for this id (first month counts as 1), capped at
       -- the window size, so early rows are averaged over fewer months.
       least(5, date_diff(ref_month, min(ref_month) over (partition by id), month) + 1) as moving_avg
from dummy_data
window id_by_month as (partition by id order by ref_month);

这个查询看起来比它背后的逻辑更复杂。它基本上是取最近的五个值相加后除以 5,同时把边界条件(以及缺失值)考虑在内。