BigQuery:如何计算最近2天的不重复访客人数

时间:2018-12-31 07:28:56

标签: sql google-bigquery

我想计算每个日期最近2天的唯一用户数。

第一个查询:我尝试使用 CASE 语句,但它只给出当天的用户数,而不是我期望的最近 2 天的结果;我也尝试过窗口函数,同样没有得到期望的结果。

我知道一种通过自联接实现的替代方案(即下面的第二个查询),它可以给出我期望的正确答案,但是我想在不做自联接的单个查询中完成。

希望在单个查询中完成的原因是想减少查询处理的数据量:如果使用自联接,就会把整张表读取两次,而原始表的大小达到数 TB。

-- First attempt: label every row with a "<previous day>::<day>" range and
-- count the distinct visitors per label.
-- NOTE: both CASE predicates compare dt against bounds derived from the same
-- row's dt, so they hold for every non-NULL dt. Each group therefore contains
-- the rows of a single day, and Visitor_Count is the per-day distinct count,
-- not a rolling two-day count.
SELECT
    (CASE WHEN dt BETWEEN DATE_SUB(dt, INTERVAL 1 DAY) AND dt THEN
        CONCAT(CAST(DATE_SUB(dt, INTERVAL 1 DAY) AS STRING), '::', CAST(dt AS STRING)) END) AS Date_range,
    COUNT(DISTINCT (CASE WHEN dt BETWEEN DATE_SUB(dt, INTERVAL 1 DAY) AND dt THEN Visitor_Name END)) AS Visitor_Count
FROM
    -- Inline sample data. The first branch fixes the column type of dt to DATE;
    -- the string literals in the following branches are coerced to it.
    (SELECT DATE('2018-01-01') AS dt, 'A' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-01' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-01' AS dt, 'C' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-01' AS dt, 'D' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-02' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-02' AS dt, 'C' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-02' AS dt, 'E' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-03' AS dt, 'A' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-03' AS dt, 'P' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-04' AS dt, 'A' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-04' AS dt, 'C' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-05' AS dt, 'D' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
    UNION ALL
    -- Duplicate (2018-01-05, 'B') row; COUNT(DISTINCT ...) collapses it.
    SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-06' AS dt, 'P' AS Visitor_Name)
GROUP BY Date_range
ORDER BY Date_range;

第二个查询(使用自联接的解决方案):

-- Rolling two-day distinct visitor count via a self-join.
-- For every visit date (side b), pull in the visits (side a) made on that date
-- or on the day before, then count the distinct visitors per two-day label.
-- The window test lives in the ON clause so that only in-window pairs are
-- joined. Joining on a.dt <= b.dt alone would also pair each date with every
-- older visit, and those pairs would end up in an extra group whose Date_range
-- is NULL and whose Visitor_Count is 0.
WITH visits AS (
    -- Inline sample data, declared once and referenced by both join sides.
    -- The first branch fixes the column type of dt to DATE; the string literals
    -- in the following branches are coerced to it.
    SELECT DATE('2018-01-01') AS dt, 'A' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-01' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-01' AS dt, 'C' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-01' AS dt, 'D' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-02' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-02' AS dt, 'C' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-02' AS dt, 'E' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-03' AS dt, 'A' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-03' AS dt, 'P' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-04' AS dt, 'A' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-04' AS dt, 'C' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-05' AS dt, 'D' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-06' AS dt, 'P' AS Visitor_Name
)
SELECT
    -- Label of the window that ends on b.dt: "<b.dt - 1 day>::<b.dt>".
    CONCAT(CAST(DATE_SUB(b.dt, INTERVAL 1 DAY) AS STRING), '::', CAST(b.dt AS STRING)) AS Date_range,
    COUNT(DISTINCT a.Visitor_Name) AS Visitor_Count
FROM visits AS a
INNER JOIN visits AS b
    ON a.dt BETWEEN DATE_SUB(b.dt, INTERVAL 1 DAY) AND b.dt
GROUP BY Date_range
ORDER BY Date_range;

2 个答案:

答案 0 :(得分:2)

您可以在聚合之前先把记录“扩增”来实现这一点。也就是说,为每个用户在其应被计入的每个日期各生成一条记录(访问当天一条,次日一条)。

这里是一个例子:

-- Rolling two-day distinct visitor count without a self-join.
-- Every visit row is emitted twice: once for its own date (inc = 0) and once
-- for the following date (inc = 1). A visit therefore lands in the bucket of
-- each date whose two-day window contains it, and a plain GROUP BY on the
-- shifted date yields the rolling count.
WITH t AS (
      -- Inline sample data. The first branch fixes the column type of dt to
      -- DATE; the string literals in the following branches are coerced to it.
      SELECT DATE('2018-01-01') AS dt, 'A' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-01' AS dt, 'B' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-01' AS dt, 'C' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-01' AS dt, 'D' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-02' AS dt, 'B' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-02' AS dt, 'C' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-02' AS dt, 'E' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-03' AS dt, 'A' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-03' AS dt, 'P' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-04' AS dt, 'A' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-04' AS dt, 'C' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-05' AS dt, 'D' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-06' AS dt, 'P' AS Visitor_Name
     )
SELECT
    expanded.dt,
    COUNT(DISTINCT expanded.visitor_name) AS num_visitors
FROM (
      -- inc is kept in the output so the outer query can tell a date's own
      -- visits (inc = 0) from visits carried over from the day before (inc = 1).
      SELECT
          DATE_ADD(t.dt, INTERVAL inc DAY) AS dt,
          t.visitor_name,
          inc
      FROM t
      CROSS JOIN UNNEST([0, 1]) AS inc
     ) AS expanded
GROUP BY expanded.dt
-- Keep only dates that have at least one visit of their own. Without this
-- filter the +1 shift produces a bucket for the day after the last visit
-- (2018-01-07 for the sample data) that holds carried-over visitors only.
HAVING LOGICAL_OR(expanded.inc = 0)
ORDER BY expanded.dt;

答案 1 :(得分:0)

以下查询适用于 BigQuery 标准 SQL:

#standardSQL
-- Rolling two-day distinct visitor count computed with a window function.
-- The CTE attaches to every row the visitor names seen on its own day and on
-- the day before; the outer query counts the distinct names in that array and
-- keeps one row per two-day label.
WITH windowed AS (
  SELECT
    dt,
    ARRAY_AGG(visitor_name) OVER two_day_window AS arr_visitors
  FROM `project.dataset.your_table`
  -- Ordering by the day number lets the RANGE frame "1 PRECEDING" reach back
  -- exactly one calendar day, regardless of how many rows each day has.
  WINDOW two_day_window AS (
    ORDER BY UNIX_DATE(dt) RANGE BETWEEN 1 PRECEDING AND CURRENT ROW
  )
)
SELECT
  CONCAT(
    CAST(DATE_SUB(dt, INTERVAL 1 DAY) AS STRING),
    '::',
    CAST(dt AS STRING)
  ) AS Date_range,
  -- All rows sharing a dt carry the same array (peers are inside the RANGE
  -- frame), so ANY_VALUE just picks one representative count per group.
  ANY_VALUE((
    SELECT COUNT(DISTINCT visitor)
    FROM UNNEST(arr_visitors) AS visitor
  )) AS Visitor_Count
FROM windowed
GROUP BY Date_range 

您可以使用问题中的示例数据来测试/试验上述查询,如下所示:

#standardSQL
-- Self-contained demo of the window-function approach: the first CTE supplies
-- the sample rows under the table name used by the generic query, the second
-- CTE collects per row the visitor names of its own day and the day before,
-- and the final SELECT counts the distinct names per two-day label.
WITH `project.dataset.your_table` AS (
  SELECT dt, Visitor_Name
  FROM UNNEST(ARRAY<STRUCT<dt DATE, Visitor_Name STRING>>[
    (DATE '2018-01-01', 'A'),
    (DATE '2018-01-01', 'B'),
    (DATE '2018-01-01', 'C'),
    (DATE '2018-01-01', 'D'),
    (DATE '2018-01-02', 'B'),
    (DATE '2018-01-02', 'C'),
    (DATE '2018-01-02', 'E'),
    (DATE '2018-01-03', 'A'),
    (DATE '2018-01-03', 'P'),
    (DATE '2018-01-04', 'A'),
    (DATE '2018-01-04', 'C'),
    (DATE '2018-01-05', 'D'),
    (DATE '2018-01-05', 'B'),
    (DATE '2018-01-05', 'B'),
    (DATE '2018-01-06', 'P')
  ])
),
windowed AS (
  SELECT
    dt,
    ARRAY_AGG(visitor_name) OVER two_day_window AS arr_visitors
  FROM `project.dataset.your_table`
  -- Ordering by the day number lets the RANGE frame "1 PRECEDING" reach back
  -- exactly one calendar day, regardless of how many rows each day has.
  WINDOW two_day_window AS (
    ORDER BY UNIX_DATE(dt) RANGE BETWEEN 1 PRECEDING AND CURRENT ROW
  )
)
SELECT
  CONCAT(
    CAST(DATE_SUB(dt, INTERVAL 1 DAY) AS STRING),
    '::',
    CAST(dt AS STRING)
  ) AS Date_range,
  -- All rows sharing a dt carry the same array (peers are inside the RANGE
  -- frame), so ANY_VALUE just picks one representative count per group.
  ANY_VALUE((
    SELECT COUNT(DISTINCT visitor)
    FROM UNNEST(arr_visitors) AS visitor
  )) AS Visitor_Count
FROM windowed
GROUP BY Date_range 
ORDER BY Date_range   

结果如下:

Row Date_range              Visitor_Count    
1   2017-12-31::2018-01-01  4    
2   2018-01-01::2018-01-02  5    
3   2018-01-02::2018-01-03  5    
4   2018-01-03::2018-01-04  3    
5   2018-01-04::2018-01-05  4    
6   2018-01-05::2018-01-06  3