BigQuery分析函数

时间:2019-07-02 23:38:32

标签: google-bigquery

我的输入数据集:

ID  store_id    count           date_time
x   1             50            1/1/2017
x   2             20            1/2/2017
x   3             30            1/1/2018
y   1             70            1/1/2017
y   2             30            2/1/2018
z   1             100           1/1/2019

输出数据集:

ID  store_id    count           date_time
x   3             30            1/1/2018
y   1             70            1/1/2017
z   1             100           1/1/2019

即逻辑是:对于同一个ID,获取count >= 60的行;否则获取date_time最新的行。

2 个答案:

答案 0 :(得分:0)

以下查询适用于BigQuery标准SQL

#standardSQL
-- One output row per id: the row with the largest cnt when that cnt is at
-- least 60, otherwise the row with the most recent date_time.
-- date_time is parsed as day/month/year.
WITH per_id AS (
  SELECT
    id,
    -- Most recent row of the group.
    ARRAY_AGG(t ORDER BY PARSE_DATE('%d/%m/%Y', date_time) DESC LIMIT 1)[OFFSET(0)] AS latest_row,
    -- Row of the group with the largest cnt.
    ARRAY_AGG(t ORDER BY cnt DESC LIMIT 1)[OFFSET(0)] AS top_cnt_row
  FROM `project.dataset.table` AS t
  GROUP BY id
)
SELECT AS VALUE
  CASE
    WHEN top_cnt_row.cnt >= 60 THEN top_cnt_row
    ELSE latest_row
  END
FROM per_id

注意:我假设您的数据格式为dd/mm/yyyy。如果是mm/dd/yyyy,则应在PARSE_DATE()中改用'%m/%d/%Y'。

如果将以上查询应用于问题中的示例数据,如下所示:

#standardSQL
-- Runs the per-id selection on the question's sample rows, which are inlined
-- under the same table name the query reads from.
WITH `project.dataset.table` AS (
  SELECT *
  FROM UNNEST([
    STRUCT('x' AS id, 1 AS store_id, 50 AS cnt, '1/1/2017' AS date_time),
    STRUCT('x' AS id, 2 AS store_id, 20 AS cnt, '1/2/2017' AS date_time),
    STRUCT('x' AS id, 3 AS store_id, 30 AS cnt, '1/1/2018' AS date_time),
    STRUCT('y' AS id, 1 AS store_id, 70 AS cnt, '1/1/2017' AS date_time),
    STRUCT('y' AS id, 2 AS store_id, 30 AS cnt, '2/1/2018' AS date_time),
    STRUCT('z' AS id, 1 AS store_id, 100 AS cnt, '1/1/2019' AS date_time)
  ])
),
per_id AS (
  SELECT
    id,
    -- Most recent row of the group (date_time parsed as day/month/year).
    ARRAY_AGG(t ORDER BY PARSE_DATE('%d/%m/%Y', date_time) DESC LIMIT 1)[OFFSET(0)] AS latest_row,
    -- Row of the group with the largest cnt.
    ARRAY_AGG(t ORDER BY cnt DESC LIMIT 1)[OFFSET(0)] AS top_cnt_row
  FROM `project.dataset.table` AS t
  GROUP BY id
)
-- Prefer the largest-cnt row when it reaches 60; otherwise take the latest row.
SELECT AS VALUE
  CASE
    WHEN top_cnt_row.cnt >= 60 THEN top_cnt_row
    ELSE latest_row
  END
FROM per_id
ORDER BY id

结果是

Row id  store_id    cnt date_time    
1   x   3           30  1/1/2018     
2   y   1           70  1/1/2017     
3   z   1           100 1/1/2019     

答案 1 :(得分:0)

这是使用分析函数的解决方案

-- Sample rows from the question. date_time is a string that the query below
-- parses as day/month/year.
WITH Raw AS (
  SELECT 'x' AS ID, 1 AS store_id, 50 AS count, '1/1/2017' AS date_time
  UNION ALL
  SELECT 'x', 2, 20, '1/2/2017'
  UNION ALL
  SELECT 'x', 3, 30, '1/1/2018'
  UNION ALL
  SELECT 'y', 1, 70, '1/1/2017'
  UNION ALL
  SELECT 'y', 2, 30, '2/1/2018'
  UNION ALL
  SELECT 'z', 1, 100, '1/1/2019'
),
-- Annotate every row with its recency rank and the highest count within its ID.
ranked AS (
  SELECT
    ID,
    store_id,
    count,
    date_time,
    -- store_id breaks ties between rows sharing the same date so that the
    -- "latest row" pick is deterministic.
    ROW_NUMBER() OVER (
      PARTITION BY ID
      ORDER BY PARSE_DATE('%d/%m/%Y', date_time) DESC, store_id DESC
    ) AS row_num,
    MAX(count) OVER (PARTITION BY ID) AS max_count
  FROM Raw
)
-- Only the four input columns are returned; row_num and max_count are helpers.
SELECT
  ID,
  store_id,
  count,
  date_time
FROM ranked
WHERE
  -- Rows that reach the threshold are always kept. An ID with several such
  -- rows returns all of them.
  count >= 60
  -- Otherwise keep the latest row of the ID. The IS NULL test covers an ID
  -- whose counts are all NULL, which `max_count < 60` alone would drop.
  OR ((max_count IS NULL OR max_count < 60) AND row_num = 1)
ORDER BY ID