BigQuery Count only last item by timestamp

时间:2019-01-18 18:22:01

标签: google-bigquery

I'm attempting to join two tables and count how many "Checklists" are completed.

What you'll notice with id: 1, is that:

  • 01-09: It was marked completed
  • 01-10: It was marked uncompleted
  • 01-11: It was marked completed again

Because of this my count is off by 1. I'm looking to only choose the last completed action per id. The actual response should be:

| Worksite   | Count |
| ---------- | ----- |
| worksite_1 | 4     |
| worksite_2 | 2     |

EDIT

I figured out how to do it if I wanted to separate the count by date, but I haven't figured out how to do it if I want a TOTAL. This seems to work when grouping by date:

-- Completed checklists per worksite per calendar day.
-- The count is over DISTINCT checklist ids rather than a constant 1: counting
-- DISTINCT 1 can never exceed 1 per group, so a day on which several
-- checklists of the same worksite were completed would be under-reported.
-- Repeated completion rows for the same checklist on the same day still
-- count once.
-- Checklists with no completion rows land in a group whose `DATE` is NULL
-- (LEFT JOIN) and contribute 0 to `Count`.
SELECT
    DATE(ChecklistCompletions.ts) AS `DATE`,
    Checklists.worksite_id AS `Worksite`,
    COUNT(DISTINCT CASE
        WHEN ChecklistCompletions.completed = 1 THEN Checklists.id
    END) AS `Count`
FROM Checklists
LEFT JOIN ChecklistCompletions
    ON Checklists.id = ChecklistCompletions.id
GROUP BY `Worksite`, `DATE`
ORDER BY `DATE` DESC

Is this something that is possible to do? Any help would be greatly appreciated.

NOTE: I'm only using MySQL as a playground. I'm looking for a solution in BigQuery Standard SQL.


Schema (MySQL v5.7)

-- One row per checklist; `worksite_id` is the column the report queries group by.
CREATE TABLE Checklists (
    `id`          varchar(55),
    `uid`         varchar(55),
    `worksite_id` varchar(55),
    `ts`          datetime
);

-- Completion events for checklists. `id` is joined to Checklists.id by the
-- queries below; `completed` is 1 when the checklist was marked completed and
-- 0 when it was marked uncompleted; `ts` is when the event happened.
CREATE TABLE ChecklistCompletions (
    `id`        varchar(55),
    `uid`       varchar(55),
    `completed` tinyint(1),
    `ts`        datetime
);

-- Sample completion events. Checklist '1' has three events (completed,
-- uncompleted, completed again); every other checklist has exactly one.
INSERT INTO ChecklistCompletions (`id`, `uid`, `completed`, `ts`)
VALUES
    ('1',  'u12345', 1, '2019-01-09 00:00:00'),
    ('1',  'u12345', 0, '2019-01-10 00:00:00'),
    ('1',  'u12345', 1, '2019-01-11 00:00:00'),
    ('2',  'u12345', 0, '2019-01-13 00:00:00'),
    ('3',  'u12345', 1, '2019-01-12 00:00:00'),
    ('4',  'u12345', 1, '2019-01-13 00:00:00'),
    ('5',  'u12345', 1, '2019-01-12 00:00:00'),
    ('6',  'u12345', 0, '2019-01-17 00:00:00'),
    ('7',  'u1',     1, '2019-01-10 00:00:00'),
    ('8',  'u1',     0, '2019-01-12 00:00:00'),
    ('9',  'u1',     1, '2019-01-15 00:05:00'),
    ('10', 'u1',     0, '2019-01-15 00:00:00')
;

-- Sample checklists: ids 1, 4, 6, 7, 8, 9 belong to worksite_1 and
-- ids 2, 3, 5, 10 belong to worksite_2.
INSERT INTO Checklists (`id`, `uid`, `worksite_id`, `ts`)
VALUES
    ('1',  'u12345', 'worksite_1', '2019-01-09 00:00:00'),
    ('2',  'u12345', 'worksite_2', '2019-01-13 00:00:00'),
    ('3',  'u12345', 'worksite_2', '2019-01-12 00:00:00'),
    ('4',  'u12345', 'worksite_1', '2019-01-13 00:00:00'),
    ('5',  'u12345', 'worksite_2', '2019-01-12 00:00:00'),
    ('6',  'u12345', 'worksite_1', '2019-01-17 00:00:00'),
    ('7',  'u1',     'worksite_1', '2019-01-10 00:00:00'),
    ('8',  'u1',     'worksite_1', '2019-01-12 00:00:00'),
    ('9',  'u1',     'worksite_1', '2019-01-15 00:05:00'),
    ('10', 'u1',     'worksite_2', '2019-01-15 00:00:00')
;

Query #1

-- Counts every completion row flagged completed = 1 per worksite.
-- A checklist that was completed more than once therefore contributes more
-- than once to `Count`.
SELECT
    c.worksite_id AS `Worksite`,
    COUNT(CASE WHEN cc.completed = 1 THEN 1 END) AS `Count`
FROM Checklists AS c
LEFT JOIN ChecklistCompletions AS cc
    ON cc.id = c.id
GROUP BY c.worksite_id;

| Worksite   | Count |
| ---------- | ----- |
| worksite_1 | 5     |
| worksite_2 | 2     |

View on DB Fiddle

1 个答案:

答案 0 :(得分:1)

以下是用于BigQuery标准SQL

   
#standardSQL
-- Per worksite, count the checklists whose MOST RECENT completion event has
-- completed = 1.
SELECT
    Worksite,
    COUNTIF(completed = 1) AS completed
FROM (
  -- One row per checklist, carrying only the state of its latest event.
  SELECT
      Checklists.worksite_id AS `Worksite`,
      -- Order by the event timestamp, not by the flag itself: ordering by
      -- `completed DESC` would return 1 for any checklist that was EVER
      -- completed, even if it was marked uncompleted afterwards.
      -- Checklists with no events yield NULL here (LEFT JOIN) and are not
      -- counted by COUNTIF.
      ARRAY_AGG(
          ChecklistCompletions.completed
          ORDER BY ChecklistCompletions.ts DESC
          LIMIT 1
      )[OFFSET(0)] AS completed
  FROM `project.dataset.Checklists` Checklists
  LEFT JOIN `project.dataset.ChecklistCompletions` ChecklistCompletions
  ON Checklists.id = ChecklistCompletions.id
  GROUP BY Checklists.id, Worksite
) GROUP BY worksite

如果将其应用于问题的样本数据,您将得到结果(如预期)

Row Worksite    completed    
1   worksite_1  4    
2   worksite_2  2     

您可以使用问题中的样本数据,通过下面的查询进行测试和尝试:
#standardSQL
-- Self-contained demo: the two CTEs below stand in for the real tables and
-- hold the sample rows from the question.
WITH `project.dataset.ChecklistCompletions` AS (
  SELECT "1" id,     "u12345" uid,   1 completed, TIMESTAMP '2019-01-09 00:00:00' ts UNION ALL
  SELECT "1",     "u12345",   0, '2019-01-10 00:00:00' UNION ALL
  SELECT "1",     "u12345",   1, '2019-01-11 00:00:00' UNION ALL
  SELECT "2",     "u12345",   0, '2019-01-13 00:00:00' UNION ALL
  SELECT "3",     "u12345",   1, '2019-01-12 00:00:00' UNION ALL
  SELECT "4",     "u12345",   1, '2019-01-13 00:00:00' UNION ALL
  SELECT "5",     "u12345",   1, '2019-01-12 00:00:00' UNION ALL
  SELECT "6",     "u12345",   0, '2019-01-17 00:00:00' UNION ALL
  SELECT "7",     "u1",       1, '2019-01-10 00:00:00' UNION ALL
  SELECT "8",     "u1",       0, '2019-01-12 00:00:00' UNION ALL
  SELECT "9",     "u1",       1, '2019-01-15 00:05:00' UNION ALL
  SELECT "10",    "u1",       0, '2019-01-15 00:00:00' 
), `project.dataset.Checklists` AS (
  SELECT "1" id,     "u12345" uid,   "worksite_1" worksite_id, TIMESTAMP '2019-01-09 00:00:00' ts UNION ALL
  SELECT "2",     "u12345",   "worksite_2", '2019-01-13 00:00:00' UNION ALL
  SELECT "3",     "u12345",   "worksite_2", '2019-01-12 00:00:00' UNION ALL
  SELECT "4",     "u12345",   "worksite_1", '2019-01-13 00:00:00' UNION ALL
  SELECT "5",     "u12345",   "worksite_2", '2019-01-12 00:00:00' UNION ALL
  SELECT "6",     "u12345",   "worksite_1", '2019-01-17 00:00:00' UNION ALL
  SELECT "7",     "u1",       "worksite_1", '2019-01-10 00:00:00' UNION ALL
  SELECT "8",     "u1",       "worksite_1", '2019-01-12 00:00:00' UNION ALL
  SELECT "9",     "u1",       "worksite_1", '2019-01-15 00:05:00' UNION ALL
  SELECT "10",    "u1",       "worksite_2", '2019-01-15 00:00:00' 
)
-- Per worksite, count the checklists whose MOST RECENT completion event has
-- completed = 1.
SELECT
    Worksite,
    COUNTIF(completed = 1) AS completed
FROM (
  -- One row per checklist, carrying only the state of its latest event.
  SELECT
      Checklists.worksite_id AS `Worksite`,
      -- Order by the event timestamp, not by the flag itself: ordering by
      -- `completed DESC` would return 1 for any checklist that was EVER
      -- completed, even if it was marked uncompleted afterwards.
      -- Checklists with no events yield NULL here (LEFT JOIN) and are not
      -- counted by COUNTIF.
      ARRAY_AGG(
          ChecklistCompletions.completed
          ORDER BY ChecklistCompletions.ts DESC
          LIMIT 1
      )[OFFSET(0)] AS completed
  FROM `project.dataset.Checklists` Checklists
  LEFT JOIN `project.dataset.ChecklistCompletions` ChecklistCompletions
  ON Checklists.id = ChecklistCompletions.id
  GROUP BY Checklists.id, Worksite
) GROUP BY worksite
ORDER BY worksite