我的输入数据集:
ID store_id count date_time
x 1 50 1/1/2017
x 2 20 1/2/2017
x 3 30 1/1/2018
y 1 70 1/1/2017
y 2 30 2/1/2018
z 1 100 1/1/2019
输出数据集:
ID store_id count date_time
x 3 30 1/1/2018
y 1 70 1/1/2017
z 1 100 1/1/2019
即逻辑是:对于相同的ID,如果存在count >= 60的行,则获取该行;否则获取date_time最新的行。
答案 0 :(得分:0)
以下查询适用于BigQuery标准SQL:
#standardSQL
-- One output row per id:
--   * the row with the highest cnt, when that cnt is at least 60;
--   * otherwise the row with the most recent date_time.
-- date_time is a string parsed as day/month/year ('%d/%m/%Y').
WITH per_id AS (
  SELECT
    id,
    -- Candidate 1: the row holding the largest cnt for this id.
    ARRAY_AGG(t ORDER BY cnt DESC LIMIT 1)[OFFSET(0)] AS top_cnt_row,
    -- Candidate 2: the row with the latest parsed date for this id.
    ARRAY_AGG(
      t ORDER BY PARSE_DATE('%d/%m/%Y', date_time) DESC LIMIT 1
    )[OFFSET(0)] AS latest_row
  FROM `project.dataset.table` AS t
  GROUP BY id
)
-- SELECT AS VALUE unpacks the chosen row struct back into the table's columns.
SELECT AS VALUE
  IF(top_cnt_row.cnt >= 60, top_cnt_row, latest_row)
FROM per_id
注意:我假设您的日期格式为dd/mm/yyyy。如果是mm/dd/yyyy,则应在PARSE_DATE()中改用'%m/%d/%Y'。
如果将以上查询应用于问题中的示例数据,如下所示:
#standardSQL
-- Demo of the per-id selection against the sample rows from the question.
WITH `project.dataset.table` AS (
  -- Inline fixture: (id, store_id, cnt, date_time as a d/m/yyyy string).
  SELECT 'x' id, 1 store_id, 50 cnt, '1/1/2017' date_time UNION ALL
  SELECT 'x', 2, 20, '1/2/2017' UNION ALL
  SELECT 'x', 3, 30, '1/1/2018' UNION ALL
  SELECT 'y', 1, 70, '1/1/2017' UNION ALL
  SELECT 'y', 2, 30, '2/1/2018' UNION ALL
  SELECT 'z', 1, 100, '1/1/2019'
),
per_id AS (
  SELECT
    id,
    -- Candidate 1: the row holding the largest cnt for this id.
    ARRAY_AGG(t ORDER BY cnt DESC LIMIT 1)[OFFSET(0)] AS top_cnt_row,
    -- Candidate 2: the row with the latest parsed date for this id.
    ARRAY_AGG(
      t ORDER BY PARSE_DATE('%d/%m/%Y', date_time) DESC LIMIT 1
    )[OFFSET(0)] AS latest_row
  FROM `project.dataset.table` AS t
  GROUP BY id
)
-- Keep the top-cnt row when it reaches 60, otherwise fall back to the latest row.
SELECT AS VALUE
  IF(top_cnt_row.cnt >= 60, top_cnt_row, latest_row)
FROM per_id
ORDER BY id
结果是
Row id store_id cnt date_time
1 x 3 30 1/1/2018
2 y 1 70 1/1/2017
3 z 1 100 1/1/2019
答案 1 :(得分:0)
这是使用分析函数(窗口函数)的解决方案:
-- Returns exactly one row per ID using window functions:
--   * if the largest count for the ID is at least 60, the row with that count;
--   * otherwise the row with the most recent date_time.
-- date_time is a string parsed as day/month/year ('%d/%m/%Y').
WITH raw_data AS (
  -- Inline sample rows from the question.
  SELECT 'x' AS ID, 1 AS store_id, 50 AS count, '1/1/2017' AS date_time
  UNION ALL
  SELECT 'x', 2, 20, '1/2/2017'
  UNION ALL
  SELECT 'x', 3, 30, '1/1/2018'
  UNION ALL
  SELECT 'y', 1, 70, '1/1/2017'
  UNION ALL
  SELECT 'y', 2, 30, '2/1/2018'
  UNION ALL
  SELECT 'z', 1, 100, '1/1/2019'
),
ranked AS (
  SELECT
    ID,
    store_id,
    count,
    date_time,
    -- 1 = row with the highest count within the ID.
    ROW_NUMBER() OVER (PARTITION BY ID ORDER BY count DESC) AS count_rank,
    -- 1 = row with the latest date within the ID.
    ROW_NUMBER() OVER (
      PARTITION BY ID
      ORDER BY PARSE_DATE('%d/%m/%Y', date_time) DESC
    ) AS date_rank,
    MAX(count) OVER (PARTITION BY ID) AS max_count
  FROM raw_data
)
-- List the columns explicitly so the helper ranks do not leak into the result.
SELECT
  ID,
  store_id,
  count,
  date_time
FROM ranked
-- Filtering on a single rank guarantees one row per ID even when several rows
-- have count >= 60. A NULL max_count takes the ELSE branch (latest row).
WHERE
  CASE
    WHEN max_count >= 60 THEN count_rank
    ELSE date_rank
  END = 1
ORDER BY ID