我有以下数据
-- Sample monthly readings for two ids. Rows that are commented out mark the
-- months that are intentionally missing from the series.
-- ref_month is a 'YYYY-MM-DD' string literal here, not a DATE.
-- Column names are taken from the first SELECT of the UNION ALL chain.
WITH dummy_data AS (
  SELECT '2017-01-01' AS ref_month, 18 AS value, 1 AS id
  UNION ALL SELECT '2017-02-01', 20, 1
  UNION ALL SELECT '2017-03-01', 22, 1
  -- UNION ALL SELECT '2017-04-01', 28, 1
  UNION ALL SELECT '2017-05-01', 30, 1
  UNION ALL SELECT '2017-06-01', 37, 1
  UNION ALL SELECT '2017-07-01', 42, 1
  -- UNION ALL SELECT '2017-08-01', 55, 1
  -- UNION ALL SELECT '2017-09-01', 49, 1
  UNION ALL SELECT '2017-10-01', 51, 1
  UNION ALL SELECT '2017-11-01', 57, 1
  UNION ALL SELECT '2017-12-01', 56, 1
  UNION ALL SELECT '2017-01-01', 18, 2
  UNION ALL SELECT '2017-02-01', 20, 2
  UNION ALL SELECT '2017-03-01', 22, 2
  UNION ALL SELECT '2017-04-01', 28, 2
  -- UNION ALL SELECT '2017-05-01', 30, 2
  -- UNION ALL SELECT '2017-06-01', 37, 2
  UNION ALL SELECT '2017-07-01', 42, 2
  UNION ALL SELECT '2017-08-01', 55, 2
  UNION ALL SELECT '2017-09-01', 49, 2
  -- UNION ALL SELECT '2017-10-01', 51, 2
  UNION ALL SELECT '2017-11-01', 57, 2
  UNION ALL SELECT '2017-12-01', 56, 2
)
我想计算每个 id 的移动平均值。我知道可以像下面这样写:
-- Naive moving average per id: the frame is the current row plus the five rows
-- before it, counted in rows rather than in calendar months.
SELECT
  id,
  ref_month,
  AVG(value) OVER trailing_six_rows AS moving_avg
FROM dummy_data
WINDOW trailing_six_rows AS (
  PARTITION BY id
  ORDER BY ref_month
  ROWS BETWEEN 5 PRECEDING AND CURRENT ROW
)
但是从我的示例数据可以看出,有些月份的数据是缺失的。在存在缺失值的情况下,有没有简便的方法来计算移动平均值?我想到的办法是先生成一个完整的日期范围:
-- One row per month start between the earliest and the latest ref_month found
-- in dummy_data. ref_month is parsed from its 'YYYY-MM-DD' string form.
date_range AS (
  SELECT reference_month
  FROM (
    -- MIN/MAX are taken on the string values first and parsed afterwards.
    SELECT
      PARSE_DATE('%Y-%m-%d', MIN(ref_month)) AS first_month,
      PARSE_DATE('%Y-%m-%d', MAX(ref_month)) AS last_month
    FROM dummy_data
  ) AS bounds
  CROSS JOIN UNNEST(
    GENERATE_DATE_ARRAY(bounds.first_month, bounds.last_month, INTERVAL 1 MONTH)
  ) AS reference_month
)
然后与所有 id 做笛卡尔积,再与我的示例数据做连接(join),但这似乎是一种反模式。有没有更好的实现方式?谢谢!
编辑:
预期结果: 对于 id 1:
2017-01-01 18
2017-02-01 19
2017-03-01 20
2017-05-01 18
2017-06-01 21.8
2017-07-01 26.2
2017-10-01 26
2017-11-01 30
2017-12-01 32.8
对于 id 2:
2017-01-01 18
2017-02-01 19
2017-03-01 20
2017-04-01 22
2017-07-01 18.4
2017-08-01 25
2017-09-01 29.2
2017-11-01 40.6
2017-12-01 43.4
答案 0 :(得分:1)
下面的写法适用于 BigQuery Standard SQL,并且确实可行! :O)
它假定您的 ref_month 是 DATE 数据类型(如果在您的实际数据中它是 STRING 类型,也没有问题,请参阅本答案最底部的注释)
#standardSQL
-- Gap-aware moving average per id.
-- month_pos numbers every month relative to 2016-01-01, so a RANGE frame of
-- "5 PRECEDING" covers the current month and the five calendar months before
-- it, no matter how many rows actually exist in that span.
-- The divisor is the number of months between the earliest and the latest row
-- present in the frame (inclusive): a month missing inside that span counts
-- as 0, while months missing at the start of the frame shorten the span.
SELECT
  id,
  ref_month,
  ROUND(window_total / (newest_pos - oldest_pos + 1)) AS correct_moving_avg
FROM (
  SELECT
    id,
    ref_month,
    SUM(value) OVER rolling_six_months AS window_total,
    FIRST_VALUE(month_pos) OVER rolling_six_months AS oldest_pos,
    LAST_VALUE(month_pos) OVER rolling_six_months AS newest_pos
  FROM (
    SELECT
      id,
      ref_month,
      value,
      DATE_DIFF(ref_month, '2016-01-01', MONTH) AS month_pos
    FROM dummy_data
  )
  WINDOW rolling_six_months AS (
    PARTITION BY id
    ORDER BY month_pos
    RANGE BETWEEN 5 PRECEDING AND CURRENT ROW
  )
)
您可以使用以下示例数据进行测试/试验
#standardSQL
WITH dummy_data AS (
  -- Sample monthly readings for two ids, with ref_month as a DATE.
  -- Commented-out rows mark the months that are intentionally missing.
  SELECT DATE '2017-01-01' AS ref_month, 18 AS value, 1 AS id
  UNION ALL SELECT DATE '2017-02-01', 20, 1
  UNION ALL SELECT DATE '2017-03-01', 22, 1
  -- UNION ALL SELECT DATE '2017-04-01', 28, 1
  UNION ALL SELECT DATE '2017-05-01', 30, 1
  UNION ALL SELECT DATE '2017-06-01', 37, 1
  UNION ALL SELECT DATE '2017-07-01', 42, 1
  -- UNION ALL SELECT DATE '2017-08-01', 55, 1
  -- UNION ALL SELECT DATE '2017-09-01', 49, 1
  UNION ALL SELECT DATE '2017-10-01', 51, 1
  UNION ALL SELECT DATE '2017-11-01', 57, 1
  UNION ALL SELECT DATE '2017-12-01', 56, 1
  UNION ALL SELECT DATE '2017-01-01', 18, 2
  UNION ALL SELECT DATE '2017-02-01', 20, 2
  UNION ALL SELECT DATE '2017-03-01', 22, 2
  UNION ALL SELECT DATE '2017-04-01', 28, 2
  -- UNION ALL SELECT DATE '2017-05-01', 30, 2
  -- UNION ALL SELECT DATE '2017-06-01', 37, 2
  UNION ALL SELECT DATE '2017-07-01', 42, 2
  UNION ALL SELECT DATE '2017-08-01', 55, 2
  UNION ALL SELECT DATE '2017-09-01', 49, 2
  -- UNION ALL SELECT DATE '2017-10-01', 51, 2
  UNION ALL SELECT DATE '2017-11-01', 57, 2
  UNION ALL SELECT DATE '2017-12-01', 56, 2
),
positioned AS (
  -- Number each month relative to 2016-01-01 so calendar gaps become numeric gaps.
  SELECT
    id,
    ref_month,
    value,
    DATE_DIFF(ref_month, '2016-01-01', MONTH) AS month_pos
  FROM dummy_data
),
windowed AS (
  -- The RANGE frame spans the current month and the five calendar months
  -- before it, regardless of how many rows exist in that span.
  SELECT
    id,
    ref_month,
    SUM(value) OVER rolling_six_months AS window_total,
    FIRST_VALUE(month_pos) OVER rolling_six_months AS oldest_pos,
    LAST_VALUE(month_pos) OVER rolling_six_months AS newest_pos
  FROM positioned
  WINDOW rolling_six_months AS (
    PARTITION BY id
    ORDER BY month_pos
    RANGE BETWEEN 5 PRECEDING AND CURRENT ROW
  )
)
-- The divisor is the inclusive month span between the earliest and the latest
-- row present in the frame, so missing months inside that span count as 0.
SELECT
  id,
  ref_month,
  ROUND(window_total / (newest_pos - oldest_pos + 1)) AS correct_moving_avg
FROM windowed
ORDER BY id, ref_month
为了帮助您理解其中的逻辑,下面给出上述查询的“扩展”版本:它把所有中间值都一直传递到最外层的 SELECT,这样您就能看到全部内容……
#standardSQL
WITH dummy_data AS (
  -- Sample monthly readings for two ids, with ref_month as a DATE.
  -- Commented-out rows mark the months that are intentionally missing.
  SELECT DATE '2017-01-01' AS ref_month, 18 AS value, 1 AS id
  UNION ALL SELECT DATE '2017-02-01', 20, 1
  UNION ALL SELECT DATE '2017-03-01', 22, 1
  -- UNION ALL SELECT DATE '2017-04-01', 28, 1
  UNION ALL SELECT DATE '2017-05-01', 30, 1
  UNION ALL SELECT DATE '2017-06-01', 37, 1
  UNION ALL SELECT DATE '2017-07-01', 42, 1
  -- UNION ALL SELECT DATE '2017-08-01', 55, 1
  -- UNION ALL SELECT DATE '2017-09-01', 49, 1
  UNION ALL SELECT DATE '2017-10-01', 51, 1
  UNION ALL SELECT DATE '2017-11-01', 57, 1
  UNION ALL SELECT DATE '2017-12-01', 56, 1
  UNION ALL SELECT DATE '2017-01-01', 18, 2
  UNION ALL SELECT DATE '2017-02-01', 20, 2
  UNION ALL SELECT DATE '2017-03-01', 22, 2
  UNION ALL SELECT DATE '2017-04-01', 28, 2
  -- UNION ALL SELECT DATE '2017-05-01', 30, 2
  -- UNION ALL SELECT DATE '2017-06-01', 37, 2
  UNION ALL SELECT DATE '2017-07-01', 42, 2
  UNION ALL SELECT DATE '2017-08-01', 55, 2
  UNION ALL SELECT DATE '2017-09-01', 49, 2
  -- UNION ALL SELECT DATE '2017-10-01', 51, 2
  UNION ALL SELECT DATE '2017-11-01', 57, 2
  UNION ALL SELECT DATE '2017-12-01', 56, 2
),
positioned AS (
  -- Number each month relative to 2016-01-01 so calendar gaps become numeric gaps.
  SELECT
    id,
    ref_month,
    value,
    DATE_DIFF(ref_month, '2016-01-01', MONTH) AS month_pos
  FROM dummy_data
),
framed AS (
  -- Every intermediate value of the calculation is exposed as its own column:
  --   moving_sum   sum of value over the frame
  --   first_month  month_pos of the earliest row present in the frame
  --   last_month   month_pos of the latest row in the frame (the current row's month)
  --   moving_avg   plain AVG over the rows present, i.e. gaps are ignored
  SELECT
    id,
    ref_month,
    value,
    SUM(value) OVER rolling_six_months AS moving_sum,
    FIRST_VALUE(month_pos) OVER rolling_six_months AS first_month,
    LAST_VALUE(month_pos) OVER rolling_six_months AS last_month,
    AVG(value) OVER rolling_six_months AS moving_avg
  FROM positioned
  WINDOW rolling_six_months AS (
    PARTITION BY id
    ORDER BY month_pos
    RANGE BETWEEN 5 PRECEDING AND CURRENT ROW
  )
)
-- correct_moving_avg divides by the inclusive month span of the frame's rows,
-- so months missing inside that span contribute 0 to the average.
SELECT
  id,
  ref_month,
  value,
  moving_sum,
  first_month,
  last_month,
  ROUND(moving_sum / (last_month - first_month + 1)) AS correct_moving_avg,
  moving_avg
FROM framed
ORDER BY id, ref_month
结果为
id ref_month value moving_sum first_month last_month correct_moving_avg moving_avg
1 2017-01-01 18 18 12 12 18.0 18.0
1 2017-02-01 20 38 12 13 19.0 19.0
1 2017-03-01 22 60 12 14 20.0 20.0
1 2017-05-01 30 90 12 16 18.0 22.5
1 2017-06-01 37 127 12 17 21.0 25.4
1 2017-07-01 42 151 13 18 25.0 30.2
1 2017-10-01 51 160 16 21 27.0 40.0
1 2017-11-01 57 187 17 22 31.0 46.75
1 2017-12-01 56 206 18 23 34.0 51.5
2 2017-01-01 18 18 12 12 18.0 18.0
2 2017-02-01 20 38 12 13 19.0 19.0
2 2017-03-01 22 60 12 14 20.0 20.0
2 2017-04-01 28 88 12 15 22.0 22.0
2 2017-07-01 42 112 13 18 19.0 28.0
2 2017-08-01 55 147 14 19 25.0 36.75
2 2017-09-01 49 174 15 20 29.0 43.5
2 2017-11-01 57 203 18 22 41.0 50.75
2 2017-12-01 56 259 18 23 43.0 51.8
希望这能展示/解释清楚这种方法
注意:如果您的 ref_month 字段是 STRING 数据类型,则需要稍微调整包含 DATE_DIFF 的那一行,应改为:
DATE_DIFF(CAST(ref_month AS DATE), '2016-01-01', MONTH) AS month_pos
注意 2:我选择 '2016-01-01' 作为计算月数的起点,但您可以选择任意日期,只要它早于您数据中的最小日期即可,例如 '2000-01-01' 同样可以正常工作
答案 1 :(得分:0)
这应该有效:
WITH dummy_data AS (
  -- Sample monthly readings for two ids; ref_month is a 'YYYY-MM-DD' string.
  -- Commented-out rows mark the months that are intentionally missing.
  SELECT '2017-01-01' AS ref_month, 18 AS value, 1 AS id
  UNION ALL SELECT '2017-02-01', 20, 1
  UNION ALL SELECT '2017-03-01', 22, 1
  -- UNION ALL SELECT '2017-04-01', 28, 1
  UNION ALL SELECT '2017-05-01', 30, 1
  UNION ALL SELECT '2017-06-01', 37, 1
  UNION ALL SELECT '2017-07-01', 42, 1
  -- UNION ALL SELECT '2017-08-01', 55, 1
  -- UNION ALL SELECT '2017-09-01', 49, 1
  UNION ALL SELECT '2017-10-01', 51, 1
  UNION ALL SELECT '2017-11-01', 57, 1
  UNION ALL SELECT '2017-12-01', 56, 1
  UNION ALL SELECT '2017-01-01', 18, 2
  UNION ALL SELECT '2017-02-01', 20, 2
  UNION ALL SELECT '2017-03-01', 22, 2
  UNION ALL SELECT '2017-04-01', 28, 2
  -- UNION ALL SELECT '2017-05-01', 30, 2
  -- UNION ALL SELECT '2017-06-01', 37, 2
  UNION ALL SELECT '2017-07-01', 42, 2
  UNION ALL SELECT '2017-08-01', 55, 2
  UNION ALL SELECT '2017-09-01', 49, 2
  -- UNION ALL SELECT '2017-10-01', 51, 2
  UNION ALL SELECT '2017-11-01', 57, 2
  UNION ALL SELECT '2017-12-01', 56, 2
),
monthly AS (
  -- Collapse to one row per id and month, averaging duplicates if any exist.
  SELECT
    id,
    ref_month,
    AVG(value) AS month_value
  FROM dummy_data
  GROUP BY id, ref_month
),
positioned AS (
  -- Number each month relative to 2016-01-01 so that calendar gaps become
  -- numeric gaps a RANGE frame can see. The anchor date is arbitrary; it only
  -- has to be the same for every row.
  SELECT
    id,
    ref_month,
    month_value,
    DATE_DIFF(CAST(ref_month AS DATE), DATE '2016-01-01', MONTH) AS month_pos
  FROM monthly
)
-- Five-month moving average in which a missing month counts as 0.
-- The previous version used AVG(...) OVER (PARTITION BY id ORDER BY ref_month)
-- without a bounded frame, which is a cumulative average from the first row and
-- takes no account of calendar gaps.
-- The divisor is the window length in months, capped by the number of months
-- elapsed since the id's first observation so the leading rows are not diluted.
SELECT
  id,
  ref_month,
  SUM(month_value) OVER five_months
    / LEAST(5, month_pos - MIN(month_pos) OVER (PARTITION BY id) + 1) AS moving_avg
FROM positioned
WINDOW five_months AS (
  PARTITION BY id
  ORDER BY month_pos
  RANGE BETWEEN 4 PRECEDING AND CURRENT ROW
)
ORDER BY id, ref_month
答案 2 :(得分:0)
如果您希望把缺失的值视为 0,并且窗口大小固定为 5,那么使用一系列 lag() 可能是最简单的方法:
-- Five-month moving average per id in which a missing month counts as 0.
-- The current value is added to each of the four previous rows, but a previous
-- row only contributes when it lies inside the window, i.e. no more than four
-- months before the current ref_month.
-- Fixes relative to the earlier version:
--   * the window test is inclusive (>=), so the row exactly four months back is kept;
--   * DATE_DIFF gets its date part and the operands in (later, earlier) order,
--     plus 1 so that the first month of an id divides by 1 rather than 0;
--   * the result column is named moving_avg.
-- NOTE(review): assumes ref_month is a DATE and that each id has at most one
-- row per month; cast or pre-aggregate first if that does not hold.
SELECT
  id,
  ref_month,
  (
    value
    + CASE
        WHEN LAG(ref_month, 1) OVER id_by_month >= DATE_SUB(ref_month, INTERVAL 4 MONTH)
          THEN LAG(value, 1) OVER id_by_month
        ELSE 0
      END
    + CASE
        WHEN LAG(ref_month, 2) OVER id_by_month >= DATE_SUB(ref_month, INTERVAL 4 MONTH)
          THEN LAG(value, 2) OVER id_by_month
        ELSE 0
      END
    + CASE
        WHEN LAG(ref_month, 3) OVER id_by_month >= DATE_SUB(ref_month, INTERVAL 4 MONTH)
          THEN LAG(value, 3) OVER id_by_month
        ELSE 0
      END
    + CASE
        WHEN LAG(ref_month, 4) OVER id_by_month >= DATE_SUB(ref_month, INTERVAL 4 MONTH)
          THEN LAG(value, 4) OVER id_by_month
        ELSE 0
      END
  )
  -- Divide by the window length, capped by the months elapsed since the id's
  -- first observation so the leading rows are not diluted.
  / LEAST(5, DATE_DIFF(ref_month, MIN(ref_month) OVER (PARTITION BY id), MONTH) + 1) AS moving_avg
FROM dummy_data
WINDOW id_by_month AS (PARTITION BY id ORDER BY ref_month);
这个查询写起来比它的逻辑本身更复杂。它基本上是把最近五个月的值相加后除以 5,此外还要处理边界条件(以及缺失值)。