我有这张桌子
+-----+-----+------------+----------+---+
| Row | id | start_time | end_time | |
+-----+-----+------------+----------+---+
| 1 | foo | 18:00:00 | 22:00:00 | |
| 2 | bar | 19:00:00 | 23:00:00 | |
| 3 | baz | 08:00:00 | 11:00:00 | |
| 4 | qux | 05:30:00 | 07:30:00 | |
+-----+-----+------------+----------+---+
可以使用此
生成WITH TABLE AS (
SELECT "foo" AS id, TIME(18,0,0) AS start_time, TIME(22,0,0) AS end_time
UNION ALL
SELECT "bar", TIME(19,0,0), TIME(23,0,0)
UNION ALL
SELECT "baz", TIME(08,0,0), TIME(11,0,0)
UNION ALL
SELECT "qux", TIME(05,30,0), TIME(07,30,0)
)
SELECT * FROM TABLE
如果22:00小时在时间间隔内,则需要在一小时左右分割该行。
对于给定的输入,输出应为:
+-----+-----+------------+----------+---+
| Row | id | start_time | end_time | |
+-----+-----+------------+----------+---+
| 1 | foo | 18:00:00 | 22:00:00 | |
| 2 | bar | 19:00:00 | 22:00:00 | |
| 3 | bar | 22:00:00 | 23:00:00 | |
| 4 | baz | 08:00:00 | 11:00:00 | |
| 5 | qux | 05:30:00 | 06:00:00 | |
| 6 | qux | 06:00:00 | 07:30:00 | |
+-----+-----+------------+----------+---+
请注意,分割行的开头和结尾都使用了22:00小时。
此外,如果小时为06:00
,则需要复制相同的确切行为是否可以直接使用BigQuery进行此行为?
答案 0 :(得分:3)
我认为你可以有06:00到22:00之间的间隔,所以需要在三个时间间隔分开
以下示例适用于BigQuery Standard SQL,并允许您轻松(但绝对不是优雅地)通过为每个断点级联相同的逻辑来扩展某些合理(2-5)断点的方法,如下所示
#standardSQL
CREATE TEMP FUNCTION split_interval(start_time TIME, end_time TIME, break TIME)
RETURNS ARRAY<STRUCT<start_time TIME, end_time TIME>> AS (
IF(break > start_time AND break < end_time,
[STRUCT<start_time TIME, end_time TIME>(start_time, break), (break, end_time)],
[STRUCT<start_time TIME, end_time TIME>(start_time, end_time)]
)
);
WITH `project.dataset.table` AS (
SELECT "foo" AS id, TIME(18,0,0) AS start_time, TIME(22,0,0) AS end_time UNION ALL
SELECT "bar", TIME(19,0,0), TIME(23,0,0) UNION ALL
SELECT "baz", TIME(08,0,0), TIME(11,0,0) UNION ALL
SELECT "qux", TIME(05,30,0), TIME(07,30,0) UNION ALL
SELECT "xxx", TIME(05,45,0), TIME(23,30,0)
)
SELECT id, new_interval.* FROM (
SELECT id, new_interval.* FROM `project.dataset.table`,
UNNEST(split_interval(start_time, end_time, TIME(22,00,0))) new_interval
),
UNNEST(split_interval(start_time, end_time, TIME(06,00,0))) new_interval
-- ORDER BY id, start_time
在上面的例子中 - 原始间隔如下
Row id start_time end_time
1 bar 19:00:00 23:00:00
2 baz 08:00:00 11:00:00
3 foo 18:00:00 22:00:00
4 qux 05:30:00 07:30:00
5 xxx 05:45:00 23:30:00
,结果是
Row id start_time end_time
1 bar 19:00:00 22:00:00
2 bar 22:00:00 23:00:00
3 baz 08:00:00 11:00:00
4 foo 18:00:00 22:00:00
5 qux 05:30:00 06:00:00
6 qux 06:00:00 07:30:00
7 xxx 05:45:00 06:00:00
8 xxx 06:00:00 22:00:00
9 xxx 22:00:00 23:30:00
正如你在这里看到的 - 如果你有第3个断点 - 你只需要在现有的位置添加另一个外部选择 - 如下所示
SELECT id, new_interval.* FROM (
existing selects ...
),
UNNEST(split_interval(start_time, end_time, TIME(08,00,0))) new_interval
正是那部分使得上面不够优雅:o)
答案 1 :(得分:3)
下面是BigQuery Standard SQL并提供了非常通用的解决方案,您可以通过在breaks
表达式中“登记”它们来获得任意数量的断点
#standardSQL
WITH `project.dataset.table` AS (
SELECT "foo" AS id, TIME(18,0,0) AS start_time, TIME(22,0,0) AS end_time UNION ALL
SELECT "bar", TIME(19,0,0), TIME(23,0,0) UNION ALL
SELECT "baz", TIME(08,0,0), TIME(11,0,0) UNION ALL
SELECT "qux", TIME(05,30,0), TIME(07,30,0) UNION ALL
SELECT "xxx", TIME(05,45,0), TIME(23,30,0)
), breaks AS (
SELECT break FROM UNNEST([TIME(6,0,0), TIME(10,0,0), TIME(22,0,0)]) break
), temp AS (
SELECT id, start_time, end_time,
ARRAY_AGG(break ORDER BY break) break
FROM `project.dataset.table` CROSS JOIN breaks
WHERE break > start_time AND break < end_time
GROUP BY id, start_time, end_time
)
SELECT id, point start_time,
IFNULL(next_point, point) end_time
FROM (
SELECT id, point,
LEAD(point) OVER(PARTITION BY id ORDER BY point) next_point
FROM temp, UNNEST(ARRAY_CONCAT([start_time], break, [end_time])) point
)
WHERE NOT next_point IS NULL
-- ORDER BY id, point
在上面的示例中,我设置了三个断点--06:00,10:00和22:00
和初始间隔为
Row id start_time end_time
1 bar 19:00:00 23:00:00
2 baz 08:00:00 11:00:00
3 foo 18:00:00 22:00:00
4 qux 05:30:00 07:30:00
5 xxx 05:45:00 23:30:00
结果是:
Row id start_time end_time
1 bar 19:00:00 22:00:00
2 bar 22:00:00 23:00:00
3 baz 08:00:00 10:00:00
4 baz 10:00:00 11:00:00
5 qux 05:30:00 06:00:00
6 qux 06:00:00 07:30:00
7 xxx 05:45:00 06:00:00
8 xxx 06:00:00 10:00:00
9 xxx 10:00:00 22:00:00
10 xxx 22:00:00 23:30:00
答案 2 :(得分:1)
这是一种方法:
WITH TABLE AS (
SELECT "foo" AS id, TIME(18,0,0) AS start_time, TIME(22,0,0) AS end_time
UNION ALL
SELECT "bar", TIME(19,0,0), TIME(23,0,0)
UNION ALL
SELECT "baz", TIME(08,0,0), TIME(11,0,0)
UNION ALL
SELECT "qux", TIME(05,30,0), TIME(07,30,0)
)
SELECT id, intervals.*
FROM TABLE,
UNNEST(IF(
start_time < TIME '22:00:00' AND end_time > TIME '22:00:00',
[STRUCT<start_time TIME, end_time TIME>(start_time, TIME '22:00:00'),(TIME '22:00:00',end_time)],
[STRUCT<start_time TIME, end_time TIME>(start_time, end_time)])) intervals
答案 3 :(得分:0)
我对@ mikhail-berlyant提供的解决方案进行了一些小调整,这是最终的查询。
我在查询中将EDIT
标记为我所做的调整
#standardSQL
WITH
`project.dataset.table` AS (
-- across split point
SELECT "bar" AS id, TIME(18,0,0) AS start_time, TIME(23,0,0) AS end_time UNION ALL
-- end at split point
SELECT "foo", TIME(21,45,0), TIME(22,0,0) UNION ALL
-- start from split point
SELECT "fuz", TIME(22,0,0), TIME(23,30,0) UNION ALL
-- across multiple split points
SELECT "qux", TIME(05,45,0), TIME(23,30,0) UNION ALL
-- no split point
SELECT "quz1", TIME(23,30,0), TIME(23,45,0) UNION ALL
-- no split point
SELECT "quz2", TIME(02,0,0), TIME(05,59,0)
),
breaks AS (
SELECT break FROM UNNEST([TIME(6,0,0), TIME(22,0,0)]) break
),
temp AS (
SELECT
id,
start_time,
end_time,
ARRAY_AGG(break ORDER BY break) AS break
FROM `project.dataset.table` CROSS JOIN breaks
WHERE
-- [EDIT] >= instead of > to include rows that starts from split point (`fuz`)
break >= start_time
-- [EDIT] <= instead of < to include rows that ends at split (`foo`)
AND break <= end_time
GROUP BY id, start_time, end_time
)
SELECT
id,
point AS start_time,
IFNULL(next_point, point) AS end_time
FROM (
SELECT
id,
point,
LEAD(point) OVER(PARTITION BY id ORDER BY point) AS next_point
FROM
temp,
UNNEST(ARRAY_CONCAT([start_time], break, [end_time])) aS point
)
WHERE next_point IS NOT NULL
-- [EDIT] Remove data with same value as start_time/end time (generated from rows that starts from split point or ends at split point, foo/fuz)
AND point != next_point
-- [EDIT] temp table does not handle ids that does not cross to any split point, adding them from main table (`quz1` and `quz2`)
UNION ALL SELECT
id,
start_time,
end_time
FROM `project.dataset.table`
WHERE id NOT IN (SELECT id FROM temp)
这是结果
+-----+------+------------+----------+---+
| Row | id | start_time | end_time | |
+-----+------+------------+----------+---+
| 1 | bar | 18:00:00 | 22:00:00 | |
| 2 | bar | 22:00:00 | 23:00:00 | |
| 3 | foo | 21:45:00 | 22:00:00 | |
| 4 | fuz | 22:00:00 | 23:30:00 | |
| 5 | qux | 05:45:00 | 06:00:00 | |
| 6 | qux | 06:00:00 | 22:00:00 | |
| 7 | qux | 22:00:00 | 23:30:00 | |
| 8 | quz1 | 23:30:00 | 23:45:00 | |
| 9 | quz2 | 02:00:00 | 05:59:00 | |
+-----+------+------------+----------+---+
修改强>
如果创建相同的输入数据作为表(使用目标表查询)
-- across split point
SELECT "bar" AS id, TIME(18,0,0) AS start_time, TIME(23,0,0) AS end_time UNION ALL
-- end at split point
SELECT "foo", TIME(21,45,0), TIME(22,0,0) UNION ALL
-- start from split point
SELECT "fuz", TIME(22,0,0), TIME(23,30,0) UNION ALL
-- across multiple split points
SELECT "qux", TIME(05,45,0), TIME(23,30,0) UNION ALL
-- no split point
SELECT "quz1", TIME(23,30,0), TIME(23,45,0) UNION ALL
-- no split point
SELECT "quz2", TIME(02,0,0), TIME(05,59,0)
然后我使用相同的表作为主查询的输入
`project.dataset.table` AS (
SELECT * FROM `myproject.mydataset.test`
),
执行主查询我收到以下错误
Error: LEFT OUTER JOIN cannot be used without a condition that is an equality of fields from both sides of the join.