SQL滚动窗口唯一计数

时间:2018-05-04 00:30:39

标签: python sql pandas google-bigquery

pandas中是否存在与此行代码相当的SQL?

假设a是一个DataFrame对象,索引是一个时间列表(包括小时,分钟和秒)。

在这种情况下,除了索引之外,

x只是DataFrame中的另一列。

# For every row, count the distinct values inside the trailing 1-hour window.
a.rolling('1h').apply(lambda window: np.unique(window).size).astype(int)

样本结果:(时间格式为HH:MM:SS)

                X
05:20:19        4   <- 1 (only 1 unique number)
05:20:19        5   <- 2 (4 and 5 are unique) * same time as before
05:37:18        7   <- 3 (4, 5 and 7 are unique)
05:45:14        4   <- 3 (4, 5, and 7)
05:56:04        4   <- 3 (4, 5, and 7)
06:18:48        6   <- 4 (now 4, 5, 6, and 7)
06:48:34        3   <- 3 (only checks past hour, so now 3, 4, 6)
07:52:48        1   <- 1 (only time in past hour, so only 1)

我也正在使用vanilla SQL。

非常感谢!

3 个答案:

答案 0 :(得分:2)

下面的示例适用于BigQuery Standard SQL

#standardSQL
-- Rolling distinct count: for every row, the number of distinct x values seen
-- in the hour ending at that row's time t.
WITH `project.dataset.your_table` AS (
  -- Inline sample rows standing in for the real table.
  SELECT TIME '05:20:19' AS t, 4 AS x UNION ALL
  SELECT TIME '05:37:18', 7 UNION ALL
  SELECT TIME '05:45:14', 4 UNION ALL
  SELECT TIME '05:56:04', 4 UNION ALL
  SELECT TIME '06:18:48', 5 UNION ALL
  SELECT TIME '06:48:34', 3 UNION ALL
  SELECT TIME '07:52:48', 1
),
hourly_window AS (
  SELECT
    t,
    x,
    -- The sort key is t expressed as whole seconds since midnight, so the frame
    -- bounds below are in seconds. 3599 PRECEDING keeps rows whose key is
    -- greater than (current - 3600), i.e. the half-open window (t - 1h, t]
    -- that pandas rolling('1h') uses by default; 3600 PRECEDING would also
    -- pull in a row that is exactly one hour old.
    ARRAY_AGG(x) OVER (
      ORDER BY TIME_DIFF(t, TIME '00:00:00', SECOND)
      RANGE BETWEEN 3599 PRECEDING AND CURRENT ROW
    ) AS arr
  FROM `project.dataset.your_table`
)
SELECT
  t,
  x,
  -- De-duplicate the values collected for this row's window.
  (SELECT COUNT(DISTINCT y) FROM UNNEST(arr) AS y) AS uniques
FROM hourly_window
-- ORDER BY t  -- uncomment for a deterministic row order

结果为

Row t           x   uniques  
1   05:20:19    4   1    
2   05:37:18    7   2    
3   05:45:14    4   2    
4   05:56:04    4   2    
5   06:18:48    5   3    
6   06:48:34    3   3    
7   07:52:48    1   1    

它使用了您问题中的精确虚拟数据 - 我觉得实际上您没有时间而是TIMESTAMP而不是ORDER BY TIME_DIFF(t, TIME '00:00:00', SECOND)您可能希望使用类似ORDER BY TIMESTAMP_DIFF(t, TIMESTAMP '2000-01-01 00:00:00', SECOND)的内容,以便查询将如下所示

#standardSQL
-- Rolling distinct count for TIMESTAMP data: for every row, the number of
-- distinct x values seen in the hour ending at that row's timestamp t.
WITH `project.dataset.your_table` AS (
  -- Inline sample rows standing in for the real table.
  SELECT TIMESTAMP '2018-01-05 05:20:19' AS t, 4 AS x UNION ALL
  SELECT TIMESTAMP '2018-01-05 05:37:18', 7 UNION ALL
  SELECT TIMESTAMP '2018-01-05 05:45:14', 4 UNION ALL
  SELECT TIMESTAMP '2018-01-05 05:56:04', 4 UNION ALL
  SELECT TIMESTAMP '2018-01-05 06:18:48', 5 UNION ALL
  SELECT TIMESTAMP '2018-01-05 06:48:34', 3 UNION ALL
  SELECT TIMESTAMP '2018-01-05 07:52:48', 1
),
hourly_window AS (
  SELECT
    t,
    x,
    -- The sort key is t expressed as whole seconds since a fixed reference
    -- timestamp, so the frame bounds below are in seconds. 3599 PRECEDING keeps
    -- rows whose key is greater than (current - 3600), i.e. the half-open
    -- window (t - 1h, t] that pandas rolling('1h') uses by default;
    -- 3600 PRECEDING would also pull in a row that is exactly one hour old.
    ARRAY_AGG(x) OVER (
      ORDER BY TIMESTAMP_DIFF(t, TIMESTAMP '2000-01-01 00:00:00', SECOND)
      RANGE BETWEEN 3599 PRECEDING AND CURRENT ROW
    ) AS arr
  FROM `project.dataset.your_table`
)
SELECT
  t,
  x,
  -- De-duplicate the values collected for this row's window.
  (SELECT COUNT(DISTINCT y) FROM UNNEST(arr) AS y) AS uniques
FROM hourly_window
-- ORDER BY t  -- uncomment for a deterministic row order

结果为

Row t                           x   uniques  
1   2018-01-05 05:20:19.000 UTC 4   1    
2   2018-01-05 05:37:18.000 UTC 7   2    
3   2018-01-05 05:45:14.000 UTC 4   2    
4   2018-01-05 05:56:04.000 UTC 4   2    
5   2018-01-05 06:18:48.000 UTC 5   3    
6   2018-01-05 06:48:34.000 UTC 3   3    
7   2018-01-05 07:52:48.000 UTC 1   1    
  

更新 - 以下是&#34;技巧&#34;满足您的额外新要求

#standardSQL
-- Variant for inputs where several rows share the same time t: each row's sort
-- key gets a random offset so that tied rows no longer sort as equals.
WITH `project.dataset.your_table` AS (
  -- Inline sample rows; the first two share t = 05:20:19.
  SELECT TIME '05:20:19' t, 4 x UNION ALL
  SELECT TIME '05:20:19', 5 UNION ALL
  SELECT TIME '05:37:18', 7 UNION ALL
  SELECT TIME '05:45:14', 4 UNION ALL
  SELECT TIME '05:56:04', 4 UNION ALL
  SELECT TIME '06:18:48', 6 UNION ALL
  SELECT TIME '06:48:34', 3 UNION ALL
  SELECT TIME '07:52:48', 1   
)
SELECT
  -- uniques = number of distinct values among those collected in arr.
  t, x, (SELECT COUNT(DISTINCT y) FROM UNNEST(arr) y) uniques
FROM (
  SELECT t, x,
    ARRAY_AGG(x) 
      -- Sort key: milliseconds since midnight plus 1000 * RAND().
      -- NOTE(review): RAND() makes the relative order of tied rows, and hence
      -- their counts, vary from run to run - confirm that is acceptable.
      OVER(ORDER BY TIME_DIFF(t, TIME '00:00:00', MILLISECOND) + 1000 * RAND() 
      -- 3600000 ms = one hour, in the same unit as the sort key.
      RANGE BETWEEN 3600000 PRECEDING AND CURRENT ROW) arr
  FROM `project.dataset.your_table`
)
-- ORDER BY t  

结果为

Row t           x   uniques  
1   05:20:19    5   1    
2   05:20:19    4   2    
3   05:37:18    7   3    
4   05:45:14    4   3    
5   05:56:04    4   3    
6   06:18:48    6   4    
7   06:48:34    3   3    
8   07:52:48    1   1    
  

又一次更新:o)

#standardSQL
-- Rolling one-hour distinct count that tolerates rows sharing the same time t.
WITH `project.dataset.your_table` AS (
  -- Inline sample rows; the first two share t = 05:20:19.
  SELECT TIME '05:20:19' AS t, 4 AS x UNION ALL
  SELECT TIME '05:20:19', 5 UNION ALL
  SELECT TIME '05:37:18', 7 UNION ALL
  SELECT TIME '05:45:14', 4 UNION ALL
  SELECT TIME '05:56:04', 4 UNION ALL
  SELECT TIME '06:18:48', 6 UNION ALL
  SELECT TIME '06:48:34', 3 UNION ALL
  SELECT TIME '07:52:48', 1
),
jittered AS (
  -- Milliseconds since midnight plus a random offset, computed once per row
  -- before the window is applied, so tied rows get distinct sort keys.
  SELECT
    t,
    x,
    TIME_DIFF(t, TIME '00:00:00', MILLISECOND) + 1000 * RAND() AS ms
  FROM `project.dataset.your_table`
),
hourly_window AS (
  -- Collect the x values whose key lies within 3600000 ms (one hour) before
  -- the current row's key, the current row included.
  SELECT
    t,
    x,
    ARRAY_AGG(x) OVER (
      ORDER BY ms
      RANGE BETWEEN 3600000 PRECEDING AND CURRENT ROW
    ) AS arr
  FROM jittered
)
SELECT
  t,
  x,
  -- De-duplicate the values collected for this row's window.
  (SELECT COUNT(DISTINCT y) FROM UNNEST(arr) AS y) AS uniques
FROM hourly_window
-- ORDER BY t

答案 1 :(得分:0)

使用时间范围关系作为连接条件,自己加入表。这是MySQL语法:

-- Rolling one-hour distinct count via a self-join: every row (t1) is paired
-- with each row (t2) whose time falls in the hour ending at t1.time, and the
-- distinct x values of those partners are counted.
SELECT
    t1.time,
    t1.x,
    COUNT(DISTINCT t2.x)
FROM yourTable AS t1
INNER JOIN yourTable AS t2
    -- Half-open window (t1.time - 1h, t1.time]. BETWEEN is closed on both
    -- ends and would also count a row that is exactly one hour old, which
    -- pandas rolling('1h') excludes by default.
    ON  t2.time >  DATE_SUB(t1.time, INTERVAL 1 HOUR)
    AND t2.time <= t1.time
GROUP BY t1.time, t1.x

DEMO

答案 2 :(得分:0)

对于MySQL,您可以使用子查询,见下文:

-- For each row, count the distinct x values among rows whose date lies after
-- (row date - 1 hour) and up to and including the row's own date.
SELECT
    t1.date,
    (
        SELECT COUNT(DISTINCT t2.x)
        FROM mytable AS t2
        WHERE t2.date > DATE_SUB(t1.date, INTERVAL 1 HOUR)
          AND t2.date <= t1.date
    ) AS uniq_rolling_count_of_x
FROM mytable AS t1
ORDER BY t1.date
;