我正在尝试将InfluxDB查询迁移到Google Cloud BigQuery。
InfluxDB是一个时间序列数据库,因此按时间间隔进行聚合非常容易。给定此数据集:
name: h2o_feet
--------------
time water_level location
2015-08-18T00:00:00Z 8.12 coyote_creek
2015-08-18T00:00:00Z 2.064 santa_monica
2015-08-18T00:06:00Z 8.005 coyote_creek
2015-08-18T00:06:00Z 2.116 santa_monica
2015-08-18T00:12:00Z 7.887 coyote_creek
2015-08-18T00:12:00Z 2.028 santa_monica
2015-08-18T00:18:00Z 7.762 coyote_creek
2015-08-18T00:18:00Z 2.126 santa_monica
2015-08-18T00:24:00Z 7.635 coyote_creek
2015-08-18T00:24:00Z 2.041 santa_monica
2015-08-18T00:30:00Z 7.5 coyote_creek
2015-08-18T00:30:00Z 2.051 santa_monica
以下查询将查询结果分为12分钟间隔:
-- Count the coyote_creek water_level readings in each 12-minute bucket of the queried window.
SELECT COUNT("water_level")
FROM "h2o_feet"
WHERE "location" = 'coyote_creek'
  AND time >= '2015-08-18T00:00:00Z'
  AND time <= '2015-08-18T00:30:00Z'
GROUP BY time(12m)
name: h2o_feet
--------------
time count
2015-08-18T00:00:00Z 2
2015-08-18T00:12:00Z 2
2015-08-18T00:24:00Z 2
有人知道BigQuery中是否有与GROUP BY time(12m)这一部分直接等效的写法吗?
洛朗
答案 0 :(得分:4)
BigQuery中没有直接等效的功能,但是您可以在Issue Tracker上提交功能请求。
同时,以下是我会考虑的几种解决方法:
选项1
#standardSQL
-- Counts coyote_creek readings per 12-minute bucket, with buckets anchored at
-- the start of the queried range (a stand-in for InfluxQL's GROUP BY time(12m)).
-- Each output row is labelled with the bucket's start time, so the label stays
-- correct even when no reading falls exactly on a bucket boundary.
WITH bucketed AS (
  SELECT
    -- Whole minutes since the anchor, rounded down to a multiple of 12,
    -- then added back to the anchor to get the bucket start.
    DATETIME_ADD(
      DATETIME '2015-08-18T00:00:00',
      INTERVAL 12 * DIV(DATETIME_DIFF(time, DATETIME '2015-08-18T00:00:00', MINUTE), 12) MINUTE
    ) AS bucket_start
  FROM `project.dataset.h2o_feet`
  WHERE location = 'coyote_creek'
    -- Both bounds are inclusive, mirroring the InfluxQL filter (>= start AND <= end).
    AND time >= DATETIME '2015-08-18T00:00:00'
    AND time <= DATETIME '2015-08-18T00:30:00'
)
SELECT
  bucket_start AS time,
  COUNT(*) AS cnt
FROM bucketed
GROUP BY bucket_start
ORDER BY bucket_start
选项2
更冗长的版本(不确定为什么要用它而不用第一个选项——也许只是为了练习代码)
#standardSQL
-- Counts coyote_creek readings per 12-minute bucket by joining the readings
-- against an explicitly generated list of [start, finish) intervals.
-- Only the two literals in start_finish need to be edited to change the range.
WITH start_finish AS (
  SELECT
    DATETIME '2015-08-18T00:00:00' AS start,
    DATETIME '2015-08-18T00:30:00' AS finish
), intervals AS (
  -- One row per bucket. A single GENERATE_ARRAY of minute offsets is enough:
  -- every bucket ends exactly 12 minutes after it starts.
  SELECT
    DATETIME_ADD(sf.start, INTERVAL offset_minutes MINUTE) AS start,
    DATETIME_ADD(DATETIME_ADD(sf.start, INTERVAL offset_minutes MINUTE), INTERVAL 12 MINUTE) AS finish,
    sf.finish AS range_finish
  FROM start_finish AS sf
  CROSS JOIN UNNEST(
    GENERATE_ARRAY(0, DATETIME_DIFF(sf.finish, sf.start, MINUTE), 12)
  ) AS offset_minutes
)
SELECT
  intervals.start,
  COUNT(*) AS cnt
FROM `project.dataset.h2o_feet` AS readings
INNER JOIN intervals
  ON readings.time >= intervals.start
  AND readings.time < intervals.finish
WHERE readings.location = 'coyote_creek'
  -- The last bucket may extend past the requested end; keep the end inclusive
  -- but do not count readings beyond it.
  AND readings.time <= intervals.range_finish
GROUP BY intervals.start
ORDER BY start
在start_finish CTE中,您只需要设置start和finish时间,其余工作由查询的其余部分完成。
您可以使用问题中的示例数据(如下所示)来测试/试用上述查询:
#standardSQL
-- Self-contained version of the interval-join query: the first CTE stands in
-- for the real table using the sample rows from the question.
WITH `project.dataset.h2o_feet` AS (
  SELECT DATETIME '2015-08-18T00:00:00' AS time, 8.12 AS water_level, 'coyote_creek' AS location UNION ALL
  SELECT DATETIME '2015-08-18T00:00:00', 2.064, 'santa_monica' UNION ALL
  SELECT DATETIME '2015-08-18T00:06:00', 8.005, 'coyote_creek' UNION ALL
  SELECT DATETIME '2015-08-18T00:06:00', 2.116, 'santa_monica' UNION ALL
  SELECT DATETIME '2015-08-18T00:12:00', 7.887, 'coyote_creek' UNION ALL
  SELECT DATETIME '2015-08-18T00:12:00', 2.028, 'santa_monica' UNION ALL
  SELECT DATETIME '2015-08-18T00:18:00', 7.762, 'coyote_creek' UNION ALL
  SELECT DATETIME '2015-08-18T00:18:00', 2.126, 'santa_monica' UNION ALL
  SELECT DATETIME '2015-08-18T00:24:00', 7.635, 'coyote_creek' UNION ALL
  SELECT DATETIME '2015-08-18T00:24:00', 2.041, 'santa_monica' UNION ALL
  SELECT DATETIME '2015-08-18T00:30:00', 7.5, 'coyote_creek' UNION ALL
  SELECT DATETIME '2015-08-18T00:30:00', 2.051, 'santa_monica'
), start_finish AS (
  -- Only these two literals need to be edited to change the queried range.
  SELECT
    DATETIME '2015-08-18T00:00:00' AS start,
    DATETIME '2015-08-18T00:30:00' AS finish
), intervals AS (
  -- One row per bucket. A single GENERATE_ARRAY of minute offsets is enough:
  -- every bucket ends exactly 12 minutes after it starts.
  SELECT
    DATETIME_ADD(sf.start, INTERVAL offset_minutes MINUTE) AS start,
    DATETIME_ADD(DATETIME_ADD(sf.start, INTERVAL offset_minutes MINUTE), INTERVAL 12 MINUTE) AS finish,
    sf.finish AS range_finish
  FROM start_finish AS sf
  CROSS JOIN UNNEST(
    GENERATE_ARRAY(0, DATETIME_DIFF(sf.finish, sf.start, MINUTE), 12)
  ) AS offset_minutes
)
SELECT
  intervals.start,
  COUNT(*) AS cnt
FROM `project.dataset.h2o_feet` AS readings
INNER JOIN intervals
  ON readings.time >= intervals.start
  AND readings.time < intervals.finish
WHERE readings.location = 'coyote_creek'
  -- The last bucket may extend past the requested end; keep the end inclusive
  -- but do not count readings beyond it.
  AND readings.time <= intervals.range_finish
GROUP BY intervals.start
ORDER BY start
两个版本均产生以下结果
Row start cnt
1 2015-08-18T00:00:00 2
2 2015-08-18T00:12:00 2
3 2015-08-18T00:24:00 2
选项3(有点傻,但可以让写法看起来更接近问题中原始查询的GROUP BY time(12m))
#standardSQL
-- Maps a reading's time to the start of its 12-minute bucket, with buckets
-- anchored at 2015-08-18T00:00:00. Wrapping this in a function lets the query
-- below read like InfluxQL's GROUP BY time(12m).
CREATE TEMP FUNCTION duration(time DATETIME) AS ((
  DATETIME_ADD(
    DATETIME '2015-08-18T00:00:00',
    -- Whole minutes since the anchor, rounded down to a multiple of 12.
    INTERVAL 12 * DIV(DATETIME_DIFF(time, DATETIME '2015-08-18T00:00:00', MINUTE), 12) MINUTE
  )
));
-- Counts coyote_creek readings per bucket; each row is labelled with the
-- bucket start rather than the earliest reading observed in that bucket.
SELECT
  duration(time) AS time,
  COUNT(*) AS cnt
FROM `project.dataset.h2o_feet`
WHERE location = 'coyote_creek'
  -- Both bounds are inclusive, mirroring the InfluxQL filter (>= start AND <= end).
  AND time >= DATETIME '2015-08-18T00:00:00'
  AND time <= DATETIME '2015-08-18T00:30:00'
GROUP BY duration(time)
ORDER BY time