BigQuery 分析函数:如何提高查询性能

时间:2017-07-07 16:45:12

标签: google-bigquery

我是 BigQuery 新手,在一个约 1 亿个数据点的数据集上查询时遇到了超时。我想找出两类点:数值开始持续稳定在 0 附近的点(停止),以及数值开始持续高于 0 的点(开始)。

我已经把被连接的那个子查询(用于确定起始文件时间)的结果单独保存为一张表,但这没有帮助。(秒数会跨多个“文件”持续递增。)

造成问题的部分是对前面若干点(prev)和后面若干点(next)所做的初始聚合。

-- Finds, per (acm, file_date, start_file_time), each 'start' row that is
-- immediately followed (among labelled rows) by a 'stop' row, and returns the
-- two seconds values as the array start_stop.
-- `test` is inline sample data; the #start / #stop tags mark the rows the
-- author expects the query to pick up.
WITH test AS
 (SELECT 'A' as ACM, CAST('2017-01-01' AS DATE) as file_date, CAST('10:10:10' AS TIME) as file_time , 0.0 as value, 0.1 as seconds
  UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.2 #start
  UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 2000, 0.3
  UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 1000, 0.4
  UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.5
  UNION ALL SELECT 'A', '2017-01-01', '10:10:10', -1000, 0.6
  UNION ALL SELECT 'A', '2017-01-01', '10:10:10', -2000, 0.7
  UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.8 #stop
  UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.9
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.0 #start
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.1
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.2
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 2000, 1.3
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.4
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.5
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', -1000, 1.6
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', -2000, 1.7
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.8
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.9
  UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 2000, 2.0
  UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 1000, 2.1 
  UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 0, 2.2 #stop
  UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 20, 2.3
  UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 0, 2.4
  UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.1
  UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.2 #start
  UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 2000, 0.3
  UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 1000, 0.4
  UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.5
  UNION ALL SELECT 'B', '2017-01-01', '10:10:10', -1000, 0.6
  UNION ALL SELECT 'B', '2017-01-01', '10:10:10', -2000, 0.7
  UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.8 #stop
  UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.9
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.0 #start
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.1
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.2
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 2000, 1.3
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.4
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.5
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', -1000, 1.6
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', -2000, 1.7
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.8
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.9
  UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 2000, 2.0
  UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 1000, 2.1 
  UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 0, 2.2 #stop
  UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 20, 2.3
  UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 0, 2.4 )
-- Level 4 (outermost): keep only rows whose two-element window is exactly
-- ['start', 'stop']; agg_sec then holds [start seconds, stop seconds].
SELECT 
  acm,
  file_date,
  start_file_time,
  file_times,
  agg_sec as start_stop
FROM (
  -- Level 3: among labelled rows only (see WHERE below), pair each row with
  -- the labelled row directly before it in the same partition (window w).
  SELECT 
    acm,
    file_date,
    start_file_time,
    file_times,
    ARRAY_AGG(kind) OVER w AS agg_kind, 
    ARRAY_AGG(seconds)  OVER w AS agg_sec
  FROM (
    -- Level 2: label candidate rows as 'start' or 'stop'.
    SELECT
      acm,
      file_date,
      start_file_time,    
      -- De-duplicate the per-row array of file_time values built in level 1.
      ARRAY(SELECT DISTINCT x FROM UNNEST(file_times) as x) AS file_times,
      seconds, 
      -- start: quiet before (|prev_val| < 50, prev_avg < 50) and active after
      --        (|next_val| >= 50, next_avg >= 50).
      -- stop:  the mirror image. Rows matching neither get a NULL kind.
      CASE 
      WHEN (ABS(prev_val) < 50 and ABS(next_val) >= 50 and next_avg >= 50 and prev_avg < 50 ) THEN 'start'
      WHEN (ABS(next_val) < 50 and ABS(prev_val) >= 50 and prev_avg >= 50 and next_avg < 50 ) THEN 'stop'
      END as kind,
      prev_val, next_val, prev_avg, next_avg
    FROM (
      -- Level 1: attach start_file_time to every sample and compute the
      -- look-behind / look-ahead statistics over every row of the partition.
      SELECT 
        s.acm as acm,
        s.file_date as file_date,
        s.start_file_time as start_file_time,
        seconds, 
        value,
        -- No ORDER BY / frame here, so every row carries the file_time of every
        -- row in its partition.
        -- NOTE(review): this per-row array grows with partition size and is a
        -- likely contributor to the cost on large inputs; confirm in the query plan.
        ARRAY_AGG(s.file_time) OVER (PARTITION BY s.acm, s.file_date, s.start_file_time) as file_times,
        -- prev frame = current row plus up to two preceding rows;
        -- next frame = current row plus up to two following rows.
        -- NTH_VALUE(value, 2) is the second row of the frame: the row just
        -- after the current one for `next`; for `prev` it is the row just
        -- before the current one once two preceding rows exist (on the second
        -- row of a partition it is the current row itself). It is NULL when
        -- the frame holds fewer than two rows.
        AVG(ABS(value)) OVER prev as prev_avg,
        NTH_VALUE(value, 2) OVER prev as prev_val,
        AVG(ABS(value)) OVER next as next_avg,
        NTH_VALUE(value, 2) OVER next as next_val
      FROM test v
      -- Self-join: one row per (acm, file_date, file_time) giving
      -- start_file_time = file_time minus FLOOR(MIN(seconds)) whole seconds.
      -- In the sample data this maps 10:10:10, 10:10:11 and 10:10:12 all to
      -- 10:10:10.
      JOIN (
        SELECT 
          acm,
          file_date,
          file_time,
          TIME_SUB(file_time, INTERVAL CAST(FLOOR(MIN(seconds)) AS INT64) SECOND) as start_file_time
        FROM test
        GROUP BY acm, file_date, file_time
      ) s ON s.acm = v.acm AND s.file_date = v.file_date AND s.file_time = v.file_time
      -- Both windows share the same partition and ordering; only the frame differs.
      WINDOW prev AS (PARTITION BY s.acm, s.file_date, s.start_file_time ORDER BY seconds ROWS 2 PRECEDING), next AS (PARTITION BY s.acm, s.file_date, s.start_file_time ORDER BY seconds ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING)
      )
    -- Only rows whose value is exactly 0 can be labelled. The filter sits
    -- outside the windowed subquery, so the statistics above still see every row.
    WHERE value = 0)
  -- Drop unlabelled rows before pairing, so w sees labelled rows only.
  WHERE kind IN ('start', 'stop')
  WINDOW w AS (PARTITION BY acm, file_date, start_file_time ORDER BY seconds ROWS 1 PRECEDING))
-- The first partition row has a one-element array and is rejected by the length check.
WHERE ARRAY_LENGTH(agg_kind) = 2 AND agg_kind[ORDINAL(1)] = 'start' AND agg_kind[ORDINAL(2)] = 'stop'
;   

2 个答案:

答案 0 :(得分:1)

请检查以下版本是否有所改善。我尽可能保留了原始代码。

  
#standardSQL
-- Returns one row per start/stop pair found in each
-- (acm, file_date, start_file_time) series: the seconds of the 'start' sample
-- and the seconds of the 'stop' sample that follows it.
-- `test` is inline sample data; the #start / #stop tags mark the expected hits.
WITH test AS
 (SELECT 'A' AS ACM, CAST('2017-01-01' AS DATE) AS file_date, CAST('10:10:10' AS TIME) AS file_time , 0.0 AS value, 0.1 AS seconds
  UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.2 #start
  UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 2000, 0.3
  UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 1000, 0.4
  UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.5
  UNION ALL SELECT 'A', '2017-01-01', '10:10:10', -1000, 0.6
  UNION ALL SELECT 'A', '2017-01-01', '10:10:10', -2000, 0.7
  UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.8 #stop
  UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.9
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.0 #start
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.1
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.2
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 2000, 1.3
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.4
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.5
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', -1000, 1.6
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', -2000, 1.7
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.8
  UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.9
  UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 2000, 2.0
  UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 1000, 2.1 
  UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 0, 2.2 #stop
  UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 20, 2.3
  UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 0, 2.4
  UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.1
  UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.2 #start
  UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 2000, 0.3
  UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 1000, 0.4
  UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.5
  UNION ALL SELECT 'B', '2017-01-01', '10:10:10', -1000, 0.6
  UNION ALL SELECT 'B', '2017-01-01', '10:10:10', -2000, 0.7
  UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.8 #stop
  UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.9
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.0 #start
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.1
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.2
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 2000, 1.3
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.4
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.5
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', -1000, 1.6
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', -2000, 1.7
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.8
  UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.9
  UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 2000, 2.0
  UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 1000, 2.1 
  UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 0, 2.2 #stop
  UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 20, 2.3
  UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 0, 2.4 
), temp1 AS (
  -- Derive start_file_time per row (file_time minus the whole seconds
  -- elapsed), which avoids a self-join on the source table.
  SELECT
    acm,
    file_date,
    value,
    seconds,
    TIME_SUB(file_time, INTERVAL CAST(FLOOR(seconds) AS INT64) SECOND) AS start_file_time
  FROM test
), temp2 AS (
  -- Look-behind / look-ahead statistics over every sample of the series.
  -- prev frame = current row plus up to two preceding rows;
  -- next frame = current row plus up to two following rows.
  -- NTH_VALUE(value, 2) is the second row of the frame and is NULL when the
  -- frame holds fewer than two rows.
  SELECT
    acm,
    file_date,
    start_file_time,
    seconds,
    value,
    AVG(ABS(value)) OVER prev AS prev_avg,
    NTH_VALUE(value, 2) OVER prev AS prev_val,
    AVG(ABS(value)) OVER next AS next_avg,
    NTH_VALUE(value, 2) OVER next AS next_val
  FROM temp1
  WINDOW
    prev AS (PARTITION BY acm, file_date, start_file_time ORDER BY seconds ROWS 2 PRECEDING),
    next AS (PARTITION BY acm, file_date, start_file_time ORDER BY seconds ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING)
), temp3 AS (
  -- Label candidate samples. Only samples whose value is exactly 0 may be
  -- labelled; the filter is applied after temp2 so the window statistics
  -- still see every row. Without it, a small non-zero sample (e.g. 20)
  -- sitting right before a jump could be reported as a 'start'.
  SELECT
    acm,
    file_date,
    start_file_time,
    seconds,
    CASE
      WHEN (ABS(prev_val) < 50 AND ABS(next_val) >= 50 AND next_avg >= 50 AND prev_avg < 50) THEN 'start'
      WHEN (ABS(next_val) < 50 AND ABS(prev_val) >= 50 AND prev_avg >= 50 AND next_avg < 50) THEN 'stop'
    END AS kind
  FROM temp2
  WHERE value = 0
), temp4 AS (
  -- Number the runs: grp increases ON every 'start' row and AFTER every 'stop'
  -- row, so a 'start' and the 'stop' that directly follows it share one grp.
  -- Unlabelled rows are dropped first: they never change the counts and can
  -- never satisfy the HAVING below, so they would only add work here.
  SELECT
    acm,
    file_date,
    start_file_time,
    seconds,
    kind,
    COUNTIF(kind = 'start') OVER (PARTITION BY acm, file_date, start_file_time ORDER BY seconds) +
    COUNTIF(kind = 'stop') OVER (PARTITION BY acm, file_date, start_file_time ORDER BY seconds ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS grp
  FROM temp3
  WHERE kind IS NOT NULL
)
SELECT
  acm,
  file_date,
  start_file_time,
  MIN(seconds) AS start_seconds,
  MAX(seconds) AS stop_seconds
FROM temp4
GROUP BY acm, file_date, start_file_time, grp
-- Keep only groups holding both labels ('start' < 'stop' as strings, so
-- MIN and MAX differ exactly when both are present).
HAVING MIN(kind) != MAX(kind)
-- Row order is unspecified; append
--   ORDER BY acm, file_date, start_file_time, start_seconds
-- if a deterministic order is needed.

答案 1 :(得分:1)

希望此查询能够为您提供所需的结果,并且能够成功处理您的数据集:

-- Groups every sample of a series into one ARRAY<STRUCT> and labels the
-- 'start' / 'stop' points inside that array, so no join and no partitioned
-- window is needed.
-- Expects a table or CTE named `test` with columns
-- ACM, file_date, file_time, value, seconds to be in scope.
SELECT
  ACM,
  file_date,
  start_file_time,
  file_times,
  -- Keep only the labelled points. ORDER BY makes the element order explicit;
  -- an ARRAY subquery without ORDER BY gives no ordering guarantee.
  ARRAY(
    SELECT STRUCT(seconds, kind)
    FROM UNNEST(file_data)
    WHERE kind IS NOT NULL
    ORDER BY seconds
  ) AS file_data
FROM (
  SELECT
    ACM,
    file_date,
    start_file_time,
    -- Distinct file_time values that make up this series.
    ARRAY(
      SELECT DISTINCT file_time
      FROM UNNEST(file_data)
      ORDER BY file_time
    ) AS file_times,
    -- Label each sample. Only samples whose value is exactly 0 can be
    -- labelled; every other sample gets a NULL kind.
    -- prev frame = current row plus up to two preceding rows;
    -- next frame = current row plus up to two following rows.
    -- NTH_VALUE(value, 2) is the second row of the frame (NULL when the
    -- frame holds fewer than two rows).
    ARRAY(
      SELECT
        STRUCT(
          seconds,
          CASE
            WHEN value = 0
              AND ABS(NTH_VALUE(value, 2) OVER (prev)) < 50
              AND ABS(NTH_VALUE(value, 2) OVER (next)) >= 50
              AND AVG(ABS(value)) OVER (next) >= 50
              AND AVG(ABS(value)) OVER (prev) < 50
              THEN 'start'
            WHEN value = 0
              AND ABS(NTH_VALUE(value, 2) OVER (next)) < 50
              AND ABS(NTH_VALUE(value, 2) OVER (prev)) >= 50
              AND AVG(ABS(value)) OVER (prev) >= 50
              AND AVG(ABS(value)) OVER (next) < 50
              THEN 'stop'
          END AS kind
        )
      FROM UNNEST(file_data)
      WINDOW
        prev AS (ORDER BY seconds ROWS 2 PRECEDING),
        next AS (ORDER BY seconds ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING)
      ORDER BY seconds
    ) AS file_data
  FROM (
    -- One row per series: start_file_time is file_time minus the whole
    -- seconds elapsed; all samples of the series are packed into file_data.
    SELECT
      ACM,
      file_date,
      TIME_SUB(file_time, INTERVAL CAST(FLOOR(seconds) AS INT64) SECOND) AS start_file_time,
      ARRAY_AGG(STRUCT(file_time, value, seconds)) AS file_data
    FROM test
    GROUP BY ACM, file_date, start_file_time
  )
)

其结果与您在 test 数据中标注的“开始”和“停止”一致。

有几点说明:

  • 我避免了昂贵的JOIN操作。
  • 尽可能多地使用ARRAY和STRUCT,这不仅提高了存储效率,而且由于只处理了所需数据而提高了查询性能,即无需处理重复数据。
  • 只有 2 个 WINDOW 子句,并且都只在对应的 ARRAY 结构内部使用,从而提升了性能。之所以可以这样做,是因为我已经把所有内容聚合到了 STRUCT 的 ARRAY 中:数据已经按序列“分好组”,因此不再需要带 PARTITION BY 的更复杂的窗口子句。
  • 此查询中没有数据重复。
  • 请注意,现在结果略有不同,我建议使用这个新结构,因为它在数据存储和进一步处理方面更有效。

请告诉我这是否适合您。