在BigQuery中求众数(mode)

时间:2015-10-27 14:32:29

标签: google-bigquery

众数(mode)是一组数据中出现次数最多的值。

我想要类似下面这样的查询:

-- Desired query shape (illustrative only): MODE() stands for the
-- "most frequent value" aggregate the question is asking for.
-- value_list is included purely to show the input values per id;
-- only value_mode is actually needed.
SELECT
    t.id as t_id,
    GROUP_CONCAT(t.value) as value_list,
    MODE(t.value) AS value_mode
FROM dataset.table as t
GROUP BY t_id

这样就能得到例如下面的结果:

t_id    value_list     value_mode
1       2,2,2,3,6,6    2

这该如何实现?

编辑:value_list仅用于说明,实际只需要众数。

3 个答案:

答案 0 :(得分:1)

对于你的例子,我会这样解决:

-- Mode of the values in a comma-separated sample string
-- (uses BigQuery legacy SQL constructs: FIRST, one-argument SPLIT).
--   innermost query: split the string into its individual values (w)
--   middle query:    count each distinct value and rank values by count
--   outer query:     keep only the top-ranked value
-- Values with equal counts are ranked in no particular order.
SELECT
  x,
  w AS mode
FROM (
  SELECT
    COUNT(*) AS cnt,
    w,
    ROW_NUMBER() OVER (ORDER BY cnt DESC) AS rnk,
    FIRST(x) AS x
  FROM (
    SELECT
      SPLIT(x) AS w,
      x
    FROM (SELECT "2,2,2,3,6,6" AS x)
  )
  GROUP BY w
)
WHERE rnk = 1

在查询中加入GROUP_CONCAT:

-- Same mode computation, additionally returning the concatenated list
-- of all values (gc). Uses BigQuery legacy SQL constructs
-- (FLATTEN, GROUP_CONCAT as a window function, FIRST).
--   innermost query: split the sample string into values (w)
--   FLATTEN:         produce one row per value
--   next level:      attach the concatenation of all values to each row
--   next level:      count each distinct value and rank values by count
--   outer query:     keep only the top-ranked value
-- Values with equal counts are ranked in no particular order.
SELECT
  gc,
  w AS mode
FROM (
  SELECT
    COUNT(*) AS cnt,
    w,
    ROW_NUMBER() OVER (ORDER BY cnt DESC) AS rnk,
    FIRST(gc) AS gc
  FROM (
    SELECT
      GROUP_CONCAT(w) OVER () AS gc,
      w
    FROM (FLATTEN((
      SELECT
        SPLIT(x) AS w,
        x
      FROM (SELECT "2,2,2,3,6,6" AS x)
    ), w))
  )
  GROUP BY w
)
WHERE rnk = 1

使用分区(PARTITION BY)按tid分别处理:

-- Mode per tid, together with the concatenated value list for that tid.
-- Uses BigQuery legacy SQL constructs (FLATTEN, GROUP_CONCAT as a
-- window function, FIRST).
--   innermost query: sample row with tid = 1 and its values split into w
--   FLATTEN:         produce one row per value
--   next level:      attach the per-tid concatenation of values to each row
--   next level:      count each (tid, value) pair and rank values within
--                    each tid by count
--   outer query:     keep the top-ranked value of each tid
-- Values with equal counts are ranked in no particular order.
SELECT
  tid,
  all_values AS value_list,
  w AS value_mode
FROM (
  SELECT
    tid,
    COUNT(*) AS cnt,
    w,
    ROW_NUMBER() OVER (PARTITION BY tid ORDER BY cnt DESC) AS rnk,
    FIRST(all_values) AS all_values
  FROM (
    SELECT
      tid,
      GROUP_CONCAT(w) OVER (PARTITION BY tid) AS all_values,
      w
    FROM (FLATTEN((
      SELECT
        1 AS tid,
        SPLIT(x) AS w,
        x
      FROM (SELECT "2,2,2,3,6,6" AS x)
    ), w))
  )
  GROUP BY tid, w
)
WHERE rnk = 1

答案 1 :(得分:1)

-- Mode per id read from dataset.table.
--   inner query:  split each row's value column into individual values (v)
--   middle query: count each (id, value, v) combination and rank the
--                 combinations within each id by count
--   outer query:  keep the top-ranked value of each id
-- Combinations with equal counts are ranked in no particular order.
-- NOTE(review): presumably value holds a comma-separated list per id —
-- confirm against the table schema.
SELECT
  id,
  value AS value_list,
  v AS value_mode
FROM (
  SELECT
    id,
    value,
    v,
    COUNT(1) AS cnt,
    ROW_NUMBER() OVER (PARTITION BY id ORDER BY cnt DESC) AS rnk
  FROM (
    SELECT
      id,
      value,
      SPLIT(value) AS v
    FROM dataset.table
  )
  GROUP BY id, value, v
)
WHERE rnk = 1

答案 2 :(得分:1)

我经常需要找出各个分组(例如按长度和安培数分组)的价格众数,以便过滤掉促销价等异常价格。我通常先把价格聚合成数组,再展开(UNNEST)并按出现频率排序,具体有两种写法:一种是使用LIMIT;另一种是使用[OFFSET(0)],后者便于在需要时获取第N个值。

两种写法都包含在下面:

-- Most / least frequent price per (length, amps) group, computed two ways.
-- Frequency ties are broken by the smaller price so that the result is
-- deterministic across runs.

-- Sample data: one row per observed price.
WITH t AS (
  SELECT 18 AS length, 'HIGH' AS amps, 99.95 AS price UNION ALL
  SELECT 18, 'HIGH', 99.95 UNION ALL
  SELECT 18, 'HIGH', 5.95 UNION ALL
  SELECT 18, 'LOW', 33.95 UNION ALL
  SELECT 18, 'LOW', 33.95 UNION ALL
  SELECT 18, 'LOW', 4.5 UNION ALL
  SELECT 3, 'HIGH', 77.95 UNION ALL
  SELECT 3, 'HIGH', 77.95 UNION ALL
  SELECT 3, 'HIGH', 9.99 UNION ALL
  SELECT 3, 'LOW', 44.95 UNION ALL
  SELECT 3, 'LOW', 44.95 UNION ALL
  SELECT 3, 'LOW', 5.65
),

-- Collect every price of a (length, amps) group into a single array.
prices_by_group AS (
  SELECT
    length,
    amps,
    ARRAY_AGG(price) AS price_array
  FROM t
  GROUP BY length, amps
)

SELECT
  length,
  amps,

  -- By LIMIT: group the array elements by value, order by frequency,
  -- and keep the first row.
  (SELECT x FROM UNNEST(price_array) AS x
    GROUP BY x ORDER BY COUNT(*) DESC, x LIMIT 1) AS most_freq_price,
  (SELECT x FROM UNNEST(price_array) AS x
    GROUP BY x ORDER BY COUNT(*) ASC, x LIMIT 1) AS least_freq_price,

  -- By OFFSET: build an array of the distinct values ordered by frequency
  -- and index into it; change the offset to get the Nth value instead.
  -- Every group has at least one row, so the array is never empty and
  -- OFFSET(0) is always in range.
  ARRAY(SELECT x FROM UNNEST(price_array) AS x
    GROUP BY x ORDER BY COUNT(*) DESC, x)[OFFSET(0)] AS most_freq_price_offset,
  ARRAY(SELECT x FROM UNNEST(price_array) AS x
    GROUP BY x ORDER BY COUNT(*) ASC, x)[OFFSET(0)] AS least_freq_price_offset

FROM prices_by_group