如何在BigQuery中让LAG()忽略空值(NULL)?

时间:2017-04-24 10:51:00

标签: google-bigquery

在BigQuery(标准SQL)中使用LAG()时,如何跳过NULL值,从而取到前一个不是NULL的值?

我按照源表的格式准备了一些示例行,但做了模糊处理。在该示例中,只有前一行的值不是NULL时,结果才是正确的。具体来说,请看第3行和第4行:它们的val_2_transposed都应该被赋值为'2017-01-25 04:02:36'(与第5行的情况一样),但实际上却是NULL。

这是有道理的。但是,肯定有一种简单的方法可以指定类似IGNORE_NULLS这样的选项吧?

-- Sample data (obfuscated): one row per (col_c, col_d) pair, all sharing the
-- same col_a / col_b values.
WITH example AS (
  SELECT
    'some_id' AS col_a,
    'foo' AS col_b,
    TIMESTAMP(ts) AS col_c,
    col_d
  FROM UNNEST([
    STRUCT('2017-01-25 03:19:50' AS ts, 'val_1' AS col_d),
    ('2017-01-25 03:19:50', 'val_2'),
    ('2017-01-25 03:19:50', 'val_3'),
    ('2017-01-25 04:01:23', 'val_1'),
    ('2017-01-25 04:01:23', 'val_2'),
    ('2017-01-25 04:01:23', 'val_3'),
    ('2017-01-25 04:01:59', 'val_1'),
    ('2017-01-25 04:01:59', 'val_2'),
    ('2017-01-25 04:01:59', 'val_3'),
    ('2017-01-25 04:02:36', 'val_1'),
    ('2017-01-25 04:02:36', 'val_2'),
    ('2017-01-25 04:02:36', 'val_3'),
    ('2017-01-25 04:02:55', 'val_1'),
    ('2017-01-25 04:02:55', 'val_3'),
    ('2017-01-25 07:16:58', 'val_1'),
    ('2017-01-25 07:16:58', 'val_3'),
    ('2017-01-25 09:35:39', 'val_1'),
    ('2017-01-25 09:35:39', 'val_3'),
    ('2017-01-25 09:46:48', 'val_1'),
    ('2017-01-25 09:46:48', 'val_2'),
    ('2017-01-25 09:46:48', 'val_3'),
    ('2017-01-25 10:47:48', 'val_2'),
    ('2017-01-25 10:47:48', 'val_3')
  ])
),
-- Pivot col_d into one timestamp column per value. A (col_c, col_d) pair that
-- does not exist in the data yields NULL in the matching column.
pivoted AS (
  SELECT
    col_a,
    col_b,
    col_c,
    MAX(CASE WHEN col_d = 'val_1' THEN col_c END) AS val_1_transposed,
    MAX(CASE WHEN col_d = 'val_2' THEN col_c END) AS val_2_transposed,
    MAX(CASE WHEN col_d = 'val_3' THEN col_c END) AS val_3_transposed
  FROM example
  GROUP BY col_a, col_b, col_c
)
-- When a cell is NULL, fall back to the value of the row one position earlier
-- in col_c order. LAG() only looks back a single row here, so if that row is
-- NULL as well the cell stays NULL (this is the limitation being asked about).
SELECT
  col_a,
  col_b,
  col_c,
  COALESCE(val_1_transposed, LAG(val_1_transposed) OVER (ORDER BY col_c)) AS val_1_transposed,
  COALESCE(val_2_transposed, LAG(val_2_transposed) OVER (ORDER BY col_c)) AS val_2_transposed,
  COALESCE(val_3_transposed, LAG(val_3_transposed) OVER (ORDER BY col_c)) AS val_3_transposed
FROM pivoted
ORDER BY col_c DESC

enter image description here

3 个答案:

答案 0 :(得分:2)

这里有两个解决方案,详细描述如下:http://sqlmag.com/t-sql/last-non-null-puzzle

我改编了其中一个解决方案。基本思路是:使用MAX窗口聚合函数返回到当前行为止最大的相关id。借助ROWS UNBOUNDED PRECEDING,每遇到一个非NULL的行就会得到一个新的MAX值,然后该值会一直向后延续,用来替换LAG得到的NULL条目。

-- Sample data (obfuscated): one row per (col_c, col_d) pair, all sharing the
-- same col_a / col_b values.
WITH example AS (
  SELECT
    'some_id' AS col_a,
    'foo' AS col_b,
    TIMESTAMP(ts) AS col_c,
    col_d
  FROM UNNEST([
    STRUCT('2017-01-25 03:19:50' AS ts, 'val_1' AS col_d),
    ('2017-01-25 03:19:50', 'val_2'),
    ('2017-01-25 03:19:50', 'val_3'),
    ('2017-01-25 04:01:23', 'val_1'),
    ('2017-01-25 04:01:23', 'val_2'),
    ('2017-01-25 04:01:23', 'val_3'),
    ('2017-01-25 04:01:59', 'val_1'),
    ('2017-01-25 04:01:59', 'val_2'),
    ('2017-01-25 04:01:59', 'val_3'),
    ('2017-01-25 04:02:36', 'val_1'),
    ('2017-01-25 04:02:36', 'val_2'),
    ('2017-01-25 04:02:36', 'val_3'),
    ('2017-01-25 04:02:55', 'val_1'),
    ('2017-01-25 04:02:55', 'val_3'),
    ('2017-01-25 07:16:58', 'val_1'),
    ('2017-01-25 07:16:58', 'val_3'),
    ('2017-01-25 09:35:39', 'val_1'),
    ('2017-01-25 09:35:39', 'val_3'),
    ('2017-01-25 09:46:48', 'val_1'),
    ('2017-01-25 09:46:48', 'val_2'),
    ('2017-01-25 09:46:48', 'val_3'),
    ('2017-01-25 10:47:48', 'val_2'),
    ('2017-01-25 10:47:48', 'val_3')
  ])
),
-- Pivot col_d into one timestamp column per value; a missing (col_c, col_d)
-- pair yields NULL in the matching column.
pivoted AS (
  SELECT
    col_a,
    col_b,
    col_c,
    MAX(IF(col_d = 'val_1', col_c, NULL)) AS val_1_transposed,
    MAX(IF(col_d = 'val_2', col_c, NULL)) AS val_2_transposed,
    MAX(IF(col_d = 'val_3', col_c, NULL)) AS val_3_transposed
  FROM example
  GROUP BY col_a, col_b, col_c
),
-- Tag every row with the col_c of the latest row so far (in col_c order) whose
-- val_2_transposed is non-NULL. Rows sharing a tag form one group: a non-NULL
-- row followed by the NULL rows that should inherit its value.
-- The windows must be ordered by col_c: col_a is the same constant on every
-- row, so ordering by it leaves the row order inside the window undefined.
grouped AS (
  SELECT
    col_a,
    col_b,
    col_c,
    val_1_transposed,
    val_2_transposed,
    val_3_transposed,
    MAX(CASE WHEN val_2_transposed IS NOT NULL THEN col_c END)
      OVER (ORDER BY col_c ROWS UNBOUNDED PRECEDING) AS grp
  FROM pivoted
)
SELECT
  col_a,
  col_b,
  col_c,
  -- One-row LAG() fallback from the question, kept so the two results can be compared.
  COALESCE(val_1_transposed, LAG(val_1_transposed) OVER (ORDER BY col_c)) AS val_1_transposed,
  COALESCE(val_2_transposed, LAG(val_2_transposed) OVER (ORDER BY col_c)) AS val_2_transposed,
  COALESCE(val_3_transposed, LAG(val_3_transposed) OVER (ORDER BY col_c)) AS val_3_transposed,
  -- Each group holds exactly one non-NULL val_2_transposed (its first row), so
  -- the running MAX carries that value across the NULL rows that follow it.
  -- Rows before the first non-NULL value have grp = NULL and stay NULL.
  MAX(val_2_transposed)
    OVER (PARTITION BY grp ORDER BY col_c ROWS UNBOUNDED PRECEDING) AS lag_ignored_nulls
FROM grouped
ORDER BY col_c DESC

enter image description here

答案 1 :(得分:1)

尝试以下

  
#standardSQL
-- Sample data (obfuscated): one row per (col_c, col_d) pair, all sharing the
-- same col_a / col_b values.
WITH example AS (
  SELECT
    'some_id' AS col_a,
    'foo' AS col_b,
    TIMESTAMP(ts) AS col_c,
    col_d
  FROM UNNEST([
    STRUCT('2017-01-25 03:19:50' AS ts, 'val_1' AS col_d),
    ('2017-01-25 03:19:50', 'val_2'),
    ('2017-01-25 03:19:50', 'val_3'),
    ('2017-01-25 04:01:23', 'val_1'),
    ('2017-01-25 04:01:23', 'val_2'),
    ('2017-01-25 04:01:23', 'val_3'),
    ('2017-01-25 04:01:59', 'val_1'),
    ('2017-01-25 04:01:59', 'val_2'),
    ('2017-01-25 04:01:59', 'val_3'),
    ('2017-01-25 04:02:36', 'val_1'),
    ('2017-01-25 04:02:36', 'val_2'),
    ('2017-01-25 04:02:36', 'val_3'),
    ('2017-01-25 04:02:55', 'val_1'),
    ('2017-01-25 04:02:55', 'val_3'),
    ('2017-01-25 07:16:58', 'val_1'),
    ('2017-01-25 07:16:58', 'val_3'),
    ('2017-01-25 09:35:39', 'val_1'),
    ('2017-01-25 09:35:39', 'val_3'),
    ('2017-01-25 09:46:48', 'val_1'),
    ('2017-01-25 09:46:48', 'val_2'),
    ('2017-01-25 09:46:48', 'val_3'),
    ('2017-01-25 10:47:48', 'val_2'),
    ('2017-01-25 10:47:48', 'val_3')
  ])
),
-- Pivot col_d into one timestamp column per value; a missing (col_c, col_d)
-- pair yields NULL in the matching column.
pivoted AS (
  SELECT
    col_a,
    col_b,
    col_c,
    MAX(CASE WHEN col_d = 'val_1' THEN col_c END) AS val_1_transposed,
    MAX(CASE WHEN col_d = 'val_2' THEN col_c END) AS val_2_transposed,
    MAX(CASE WHEN col_d = 'val_3' THEN col_c END) AS val_3_transposed
  FROM example
  GROUP BY col_a, col_b, col_c
),
-- Turn every cell into a string. A non-NULL cell keeps its own value; a NULL
-- cell receives the comma-joined non-NULL values of all rows with an earlier
-- col_c (the rows that follow it in descending col_c order).
-- NOTE(review): taking the first list element below assumes STRING_AGG emits
-- values in window order, i.e. nearest earlier row first - confirm.
candidates AS (
  SELECT
    col_a,
    col_b,
    col_c,
    IF(
      val_1_transposed IS NULL,
      STRING_AGG(CAST(val_1_transposed AS STRING))
        OVER (ORDER BY col_c DESC ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING),
      CAST(val_1_transposed AS STRING)
    ) AS val_1_transposed,
    IF(
      val_2_transposed IS NULL,
      STRING_AGG(CAST(val_2_transposed AS STRING))
        OVER (ORDER BY col_c DESC ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING),
      CAST(val_2_transposed AS STRING)
    ) AS val_2_transposed,
    IF(
      val_3_transposed IS NULL,
      STRING_AGG(CAST(val_3_transposed AS STRING))
        OVER (ORDER BY col_c DESC ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING),
      CAST(val_3_transposed AS STRING)
    ) AS val_3_transposed
  FROM pivoted
)
-- Keep only the first comma-separated element of each cell and convert it
-- back to TIMESTAMP. A NULL cell (no earlier value at all) stays NULL.
SELECT
  col_a,
  col_b,
  col_c,
  TIMESTAMP(SPLIT(val_1_transposed)[SAFE_OFFSET(0)]) AS val_1_transposed,
  TIMESTAMP(SPLIT(val_2_transposed)[SAFE_OFFSET(0)]) AS val_2_transposed,
  TIMESTAMP(SPLIT(val_3_transposed)[SAFE_OFFSET(0)]) AS val_3_transposed
FROM candidates
ORDER BY col_c DESC

我意识到上面看起来过于沉重 - 下面是稍微重构的版本(现在比原始代码更轻):

#standardSQL
-- Returns the first comma-separated element of `list` as a TIMESTAMP.
-- A NULL `list` yields NULL.
CREATE TEMP FUNCTION GetFirst(list STRING) AS (
  TIMESTAMP(SPLIT(list)[SAFE_OFFSET(0)])
);
-- dummy data here ...
-- (placeholder: the query below reads from `example`, which this snippet does
-- not define - presumably the same WITH clause as in the earlier snippets)
SELECT
  col_a,
  col_b,
  col_c,
  -- A non-NULL cell keeps its own value; a NULL cell gets the comma-joined
  -- values gathered by the `lookback` window, of which GetFirst keeps the first.
  GetFirst(COALESCE(val_1_transposed, STRING_AGG(val_1_transposed) OVER (lookback))) AS val_1_transposed,
  GetFirst(COALESCE(val_2_transposed, STRING_AGG(val_2_transposed) OVER (lookback))) AS val_2_transposed,
  GetFirst(COALESCE(val_3_transposed, STRING_AGG(val_3_transposed) OVER (lookback))) AS val_3_transposed
FROM (
  -- Pivot col_d into one column per value, already converted to STRING so the
  -- outer query can aggregate the cells without further casts.
  SELECT
    col_a,
    col_b,
    col_c,
    STRING(MAX(CASE WHEN col_d = 'val_1' THEN col_c END)) AS val_1_transposed,
    STRING(MAX(CASE WHEN col_d = 'val_2' THEN col_c END)) AS val_2_transposed,
    STRING(MAX(CASE WHEN col_d = 'val_3' THEN col_c END)) AS val_3_transposed
  FROM example
  GROUP BY col_a, col_b, col_c
)
-- In descending col_c order, the rows after the current one are the rows with
-- an earlier col_c.
WINDOW lookback AS (ORDER BY col_c DESC ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING)
ORDER BY col_c DESC

答案 2 :(得分:1)

尝试以下版本
这是对Pentium10所给链接中第二种解决方案的BigQuery改写。看起来它可以胜过第一种方案,因为它只涉及一个窗口聚合函数。

  
#standardSQL
-- dummy data here
-- (placeholder: the query below reads from `example`, which this snippet does
-- not define - presumably the same WITH clause as in the earlier snippets)
SELECT
  col_a,
  col_b,
  col_c,
  -- Last-non-NULL fill with a single window aggregate per column:
  --   1. Prefix each value with a sort key derived from its row's col_c.
  --      CONCAT returns NULL when the value is NULL, so such rows are ignored
  --      by MAX.
  --   2. The running MAX therefore picks the non-NULL row with the latest
  --      col_c seen so far.
  --   3. SUBSTR strips the key again, leaving the value to convert back.
  -- The key is rendered with a fixed width of 26 characters
  -- ('YYYY-MM-DD HH:MM:SS.ffffff'), so string order matches time order and the
  -- value always starts at position 27, even for timestamps that carry
  -- fractional seconds. STRING(col_c) varies in length for such timestamps and
  -- would break a hard-coded offset.
  TIMESTAMP(SUBSTR(
    MAX(CONCAT(FORMAT_TIMESTAMP('%E4Y-%m-%d %H:%M:%E6S', col_c), STRING(val_1_transposed))) OVER (win),
    27
  )) AS val_1_transposed,
  TIMESTAMP(SUBSTR(
    MAX(CONCAT(FORMAT_TIMESTAMP('%E4Y-%m-%d %H:%M:%E6S', col_c), STRING(val_2_transposed))) OVER (win),
    27
  )) AS val_2_transposed,
  TIMESTAMP(SUBSTR(
    MAX(CONCAT(FORMAT_TIMESTAMP('%E4Y-%m-%d %H:%M:%E6S', col_c), STRING(val_3_transposed))) OVER (win),
    27
  )) AS val_3_transposed
FROM (
  -- Pivot col_d into one timestamp column per value; a missing (col_c, col_d)
  -- pair yields NULL in the matching column.
  SELECT
    col_a,
    col_b,
    col_c,
    MAX(IF(col_d = 'val_1', col_c, NULL)) AS val_1_transposed,
    MAX(IF(col_d = 'val_2', col_c, NULL)) AS val_2_transposed,
    MAX(IF(col_d = 'val_3', col_c, NULL)) AS val_3_transposed
  FROM example
  GROUP BY col_a, col_b, col_c
)
-- Running frame: every row of the same (col_a, col_b) pair up to and including
-- the current row, in col_c order.
WINDOW win AS (PARTITION BY col_a, col_b ORDER BY col_c ROWS UNBOUNDED PRECEDING)
ORDER BY col_c DESC