我是 BigQuery 新手,在一个约 1 亿个数据点的数据集上查询时遇到了超时。我想找出两类点:数值开始持续稳定在 0 附近的点(停止),以及数值开始持续高于 0 的点(开始)。
我已经把被连接的那个子查询(用于确定文件的起始时间)的结果单独保存成了一张表,但这没有帮助。(seconds 会跨多个“文件”持续递增。)
导致问题的部分是最初对前面若干个点和后面若干个点所做的聚合。
-- Question query (BigQuery Standard SQL): for each (acm, file_date, start_file_time)
-- partition, return the [start, stop] `seconds` pairs of consecutive start/stop rows.
-- `test` is inline sample data with columns (ACM, file_date, file_time, value, seconds);
-- the trailing #start / #stop notes appear to mark the rows expected to be detected.
WITH test AS
(SELECT 'A' as ACM, CAST('2017-01-01' AS DATE) as file_date, CAST('10:10:10' AS TIME) as file_time , 0.0 as value, 0.1 as seconds
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.2 #start
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 2000, 0.3
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 1000, 0.4
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.5
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', -1000, 0.6
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', -2000, 0.7
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.8 #stop
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.9
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.0 #start
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.1
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.2
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 2000, 1.3
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.4
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.5
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', -1000, 1.6
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', -2000, 1.7
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.8
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.9
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 2000, 2.0
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 1000, 2.1
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 0, 2.2 #stop
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 20, 2.3
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 0, 2.4
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.1
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.2 #start
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 2000, 0.3
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 1000, 0.4
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.5
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', -1000, 0.6
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', -2000, 0.7
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.8 #stop
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.9
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.0 #start
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.1
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.2
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 2000, 1.3
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.4
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.5
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', -1000, 1.6
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', -2000, 1.7
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.8
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.9
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 2000, 2.0
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 1000, 2.1
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 0, 2.2 #stop
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 20, 2.3
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 0, 2.4 )
-- Level 4 (outermost): expose each two-element seconds array as start_stop.
SELECT
acm,
file_date,
start_file_time,
file_times,
agg_sec as start_stop
FROM (
-- Level 3: over the rows that survived as 'start'/'stop', collect each row together
-- with its immediate predecessor (window w = ROWS 1 PRECEDING) into two arrays.
SELECT
acm,
file_date,
start_file_time,
file_times,
ARRAY_AGG(kind) OVER w AS agg_kind,
ARRAY_AGG(seconds) OVER w AS agg_sec
FROM (
-- Level 2: classify rows. Only rows with value = 0 reach the CASE (see WHERE below).
SELECT
acm,
file_date,
start_file_time,
-- De-duplicate the file_time values gathered for the whole partition.
ARRAY(SELECT DISTINCT x FROM UNNEST(file_times) as x) AS file_times,
seconds,
CASE
-- 'start': the preceding frame is quiet (below 50) and the following frame is active (50 or more).
WHEN (ABS(prev_val) < 50 and ABS(next_val) >= 50 and next_avg >= 50 and prev_avg < 50 ) THEN 'start'
-- 'stop': the mirror condition; rows matching neither branch get a NULL kind.
WHEN (ABS(next_val) < 50 and ABS(prev_val) >= 50 and prev_avg >= 50 and next_avg < 50 ) THEN 'stop'
END as kind,
prev_val, next_val, prev_avg, next_avg
FROM (
-- Level 1: join every row to its file's start time, then compute look-behind and
-- look-ahead statistics over frames of up to three rows ordered by seconds.
SELECT
s.acm as acm,
s.file_date as file_date,
s.start_file_time as start_file_time,
seconds,
value,
-- No ORDER BY / frame here, so every row carries all file_time values of its partition.
ARRAY_AGG(s.file_time) OVER (PARTITION BY s.acm, s.file_date, s.start_file_time) as file_times,
AVG(ABS(value)) OVER prev as prev_avg,
-- NTH_VALUE(value, 2) picks the second row of the frame (NULL when the frame has a single row).
NTH_VALUE(value, 2) OVER prev as prev_val,
AVG(ABS(value)) OVER next as next_avg,
NTH_VALUE(value, 2) OVER next as next_val
FROM test v
JOIN (
-- One row per (acm, file_date, file_time): start_file_time is file_time shifted back
-- by the whole seconds of the smallest `seconds` value seen for that file_time.
SELECT
acm,
file_date,
file_time,
TIME_SUB(file_time, INTERVAL CAST(FLOOR(MIN(seconds)) AS INT64) SECOND) as start_file_time
FROM test
GROUP BY acm, file_date, file_time
) s ON s.acm = v.acm AND s.file_date = v.file_date AND s.file_time = v.file_time
-- prev = current row plus the 2 rows before it; next = current row plus the 2 rows after it.
WINDOW prev AS (PARTITION BY s.acm, s.file_date, s.start_file_time ORDER BY seconds ROWS 2 PRECEDING), next AS (PARTITION BY s.acm, s.file_date, s.start_file_time ORDER BY seconds ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING)
)
-- Applied after the Level 1 windows were computed, so the frames still saw non-zero rows.
WHERE value = 0)
-- Drop unclassified rows before pairing neighbours with window w.
WHERE kind IN ('start', 'stop')
WINDOW w AS (PARTITION BY acm, file_date, start_file_time ORDER BY seconds ROWS 1 PRECEDING))
-- Keep only pairs where a 'start' is immediately followed by a 'stop'.
WHERE ARRAY_LENGTH(agg_kind) = 2 AND agg_kind[ORDINAL(1)] = 'start' AND agg_kind[ORDINAL(2)] = 'stop'
;
答案 0 :(得分:1)
请检查下面的版本是否有所改善。我尽可能保留了原始代码。
#standardSQL
-- Join-free rewrite of the start/stop detection query.
-- Output: one row per detected interval with
--   (acm, file_date, start_file_time, start_seconds, stop_seconds).
-- `test` is inline sample data with columns (ACM, file_date, file_time, value, seconds).
WITH test AS
(SELECT 'A' AS ACM, CAST('2017-01-01' AS DATE) AS file_date, CAST('10:10:10' AS TIME) AS file_time , 0.0 AS value, 0.1 AS seconds
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.2 #start
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 2000, 0.3
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 1000, 0.4
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.5
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', -1000, 0.6
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', -2000, 0.7
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.8 #stop
UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.9
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.0 #start
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.1
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.2
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 2000, 1.3
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.4
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.5
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', -1000, 1.6
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', -2000, 1.7
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.8
UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.9
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 2000, 2.0
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 1000, 2.1
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 0, 2.2 #stop
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 20, 2.3
UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 0, 2.4
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.1
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.2 #start
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 2000, 0.3
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 1000, 0.4
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.5
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', -1000, 0.6
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', -2000, 0.7
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.8 #stop
UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.9
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.0 #start
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.1
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.2
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 2000, 1.3
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.4
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.5
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', -1000, 1.6
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', -2000, 1.7
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.8
UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.9
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 2000, 2.0
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 1000, 2.1
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 0, 2.2 #stop
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 20, 2.3
UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 0, 2.4
), temp1 AS (
  -- Derive the file start time per row (file_time minus the whole seconds elapsed),
  -- which replaces the self-join on a GROUP BY subquery.
  SELECT acm, file_date, value, seconds,
    TIME_SUB(file_time, INTERVAL CAST(FLOOR(seconds) AS INT64) SECOND) AS start_file_time
  FROM test
), temp2 AS (
  -- Look-behind / look-ahead statistics over frames of up to three rows.
  -- NTH_VALUE(value, 2) is the second row of the frame (NULL for a one-row frame).
  -- `value` is carried along so the next step can restrict classification to zero rows.
  SELECT
    acm, file_date, start_file_time, seconds, value,
    AVG(ABS(value)) OVER prev AS prev_avg,
    NTH_VALUE(value, 2) OVER prev AS prev_val,
    AVG(ABS(value)) OVER next AS next_avg,
    NTH_VALUE(value, 2) OVER next AS next_val
  FROM temp1 WINDOW
    prev AS (PARTITION BY acm, file_date, start_file_time ORDER BY seconds ROWS 2 PRECEDING),
    next AS (PARTITION BY acm, file_date, start_file_time ORDER BY seconds ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING)
), temp3 AS (
  -- Classify candidate rows. Only rows whose own value is 0 may become a start/stop;
  -- without this filter a small non-zero reading could be labelled as a boundary.
  -- The filter runs after temp2, so the window frames still saw the non-zero rows.
  SELECT
    acm, file_date, start_file_time, seconds,
    CASE
      WHEN (ABS(prev_val) < 50 AND ABS(next_val) >= 50 AND next_avg >= 50 AND prev_avg < 50 ) THEN 'start'
      WHEN (ABS(next_val) < 50 AND ABS(prev_val) >= 50 AND prev_avg >= 50 AND next_avg < 50 ) THEN 'stop'
    END AS kind
  FROM temp2
  WHERE value = 0
), temp4 AS (
  -- Assign a group id that increases at every 'start' row and right after every 'stop'
  -- row, so each group begins at a start (if it has one) and ends at the first stop.
  SELECT *,
    COUNTIF(kind = 'start') OVER (PARTITION BY acm, file_date, start_file_time ORDER BY seconds) +
    COUNTIF(kind = 'stop') OVER (PARTITION BY acm, file_date, start_file_time ORDER BY seconds ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS grp
  FROM temp3
)
SELECT
  acm, file_date, start_file_time,
  MIN(seconds) AS start_seconds,
  MAX(seconds) AS stop_seconds
FROM temp4
GROUP BY acm, file_date, start_file_time, grp
-- Keep only groups containing both a 'start' and a 'stop'; groups with a single kind
-- (or only NULL kinds) are discarded.
HAVING MIN(kind) != MAX(kind)
-- For deterministic row order on small inputs, append:
--   ORDER BY acm, file_date, start_file_time, start_seconds
答案 1 :(得分:1)
希望此查询能够为您提供所需的结果,并且能够成功处理您的数据集:
-- Start/stop detection without a JOIN: each file's rows are packed into one
-- ARRAY<STRUCT>, and the look-behind / look-ahead windows run inside that array.
-- Expects a table or CTE named `test` with (ACM, file_date, file_time, value, seconds).
-- Output columns: ACM, file_date, start_file_time, file_times, file_data, where
-- file_data holds the (seconds, kind) entries of the detected boundaries.
SELECT
  ACM,
  file_date,
  start_file_time,
  file_times,
  -- Keep only classified points. ORDER BY makes the element order chronological;
  -- without it the array order is not guaranteed.
  ARRAY(
    SELECT STRUCT(seconds, kind)
    FROM UNNEST(file_data)
    WHERE kind IS NOT NULL
    ORDER BY seconds
  ) AS file_data
FROM (
  SELECT
    ACM,
    file_date,
    start_file_time,
    -- Distinct file_time values of the file, in ascending order.
    ARRAY(
      SELECT DISTINCT file_time
      FROM UNNEST(file_data)
      ORDER BY file_time
    ) AS file_times,
    -- Classify every point of the file; only points whose own value is 0 can be
    -- labelled, all others get a NULL kind.
    ARRAY(
      SELECT STRUCT(
        seconds,
        CASE
          WHEN value = 0
            AND ABS(NTH_VALUE(value, 2) OVER (prev)) < 50
            AND ABS(NTH_VALUE(value, 2) OVER (next)) >= 50
            AND AVG(ABS(value)) OVER (next) >= 50
            AND AVG(ABS(value)) OVER (prev) < 50
            THEN 'start'
          WHEN value = 0
            AND ABS(NTH_VALUE(value, 2) OVER (next)) < 50
            AND ABS(NTH_VALUE(value, 2) OVER (prev)) >= 50
            AND AVG(ABS(value)) OVER (prev) >= 50
            AND AVG(ABS(value)) OVER (next) < 50
            THEN 'stop'
        END AS kind)
      FROM UNNEST(file_data)
      -- No PARTITION BY: the array already holds exactly one file's points.
      WINDOW
        prev AS (ORDER BY seconds ROWS 2 PRECEDING),
        next AS (ORDER BY seconds ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING)
    ) AS file_data
  FROM (
    -- One row per file: start_file_time is file_time minus the whole seconds elapsed.
    SELECT
      ACM,
      file_date,
      TIME_SUB(file_time, INTERVAL CAST(FLOOR(seconds) AS INT64) SECOND) AS start_file_time,
      ARRAY_AGG(STRUCT(file_time, value, seconds)) AS file_data
    FROM test
    GROUP BY ACM, file_date, start_file_time
  )
)
其结果正是您在 `test` 数据中标注的“开始”和“停止”。
需要说明的几点:
- 去掉了 `JOIN` 操作。
- 只定义了简单的 `WINDOW` 子句,每个子句都在对应的 ARRAY 结构中使用,从而提升了性能。请注意,之所以可以这样做,是因为我把所有内容都聚合到了一个由 STRUCT 组成的 ARRAY 中;数据已经按文件“分好组”,因此不需要更复杂的窗口子句。请告诉我这是否适合您。