我的表如下:
文档:
*= require_tree .
*= require_self
*/
@import "bootstrap-sprockets";
@import "bootstrap";
(And some custom styling here)
我想获得最近30天的得分中位数。
我当前的查询是:
+-----+-------------+-------------------------+
| dId | score | datetime |
+-----+-------------+-------------------------+
| A | 100.0 | 2019-03-08 16:17:34.043 |
| B | 80.5 | 2019-02-15 16:17:34.043 |
| C | 70.1 | 2019-03-08 16:17:34.043 |
+-----+-------------+-------------------------+
我该怎么做?
答案 0 :(得分:1)
以下适用于 BigQuery 标准 SQL
#standardSQL
-- Median of an array that the caller has already sorted ascending
-- (done below via ARRAY_AGG(... ORDER BY ...)).
--   odd length  -> the middle element
--   even length -> the mean of the two middle elements
-- The parameter is templated (ANY TYPE) so that fractional scores such as
-- 80.5 are accepted; an ARRAY<INT64> signature would reject them.
-- SAFE_OFFSET makes an empty array yield NULL instead of raising an
-- out-of-bounds error on index -1.
CREATE TEMP FUNCTION Median(arr ANY TYPE) AS (
  IF(
    MOD(ARRAY_LENGTH(arr), 2) = 1,
    arr[SAFE_OFFSET(DIV(ARRAY_LENGTH(arr), 2))],
    (
      arr[SAFE_OFFSET(DIV(ARRAY_LENGTH(arr), 2) - 1)]
      + arr[SAFE_OFFSET(DIV(ARRAY_LENGTH(arr), 2))]
    ) / 2
  )
);
-- Median score over rows whose date falls within the last 30 calendar days.
-- The question's table names its time column `datetime`.
-- IGNORE NULLS keeps NULL scores out of the sorted array so they cannot
-- shift the middle position.
SELECT Median(ARRAY_AGG(score IGNORE NULLS ORDER BY score)) AS Median
FROM `project.dataset.document`
WHERE DATE(datetime) >= DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY)
您可以使用问题中的示例数据来测试、试用上面的查询,如下例所示
#standardSQL
-- Median of an INT64 array that the caller has already sorted ascending:
-- the middle element for an odd length, otherwise the mean of the two
-- middle elements.
CREATE TEMP FUNCTION Median(arr ARRAY<INT64>) AS (
  CASE
    WHEN MOD(ARRAY_LENGTH(arr), 2) = 1
      THEN arr[OFFSET(DIV(ARRAY_LENGTH(arr), 2))]
    ELSE (
      arr[OFFSET(DIV(ARRAY_LENGTH(arr), 2) - 1)]
      + arr[OFFSET(DIV(ARRAY_LENGTH(arr), 2))]
    ) / 2
  END
);
-- Inline stand-in for the real table, built from the question's sample rows
-- (scores rounded to integers).
WITH `project.dataset.document` AS (
  SELECT 'A' AS dId, 100 AS score, DATETIME '2019-03-08 16:17:34.043' AS dt
  UNION ALL
  SELECT 'B', 80, '2019-02-15 16:17:34.043'
  UNION ALL
  SELECT 'C', 70, '2019-03-08 16:17:34.043'
)
-- The filter is relative to today's date, so which sample rows survive
-- (and therefore the output) depends on when the query is run.
SELECT Median(ARRAY_AGG(score ORDER BY score)) AS Median
FROM `project.dataset.document`
WHERE DATE(dt) >= DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY)
结果如下(结果取决于运行日期:原回答运行时只有 A、C 两行落在最近30天内,因此中位数为 (100 + 70) / 2):
Row Median
1 85.0
请注意,您可以使用CREATE TEMP FUNCTION Median(arr ANY TYPE) AS (...
使其更通用,并接受任何类型的序列
更新
以下示例应适用于NUMERIC
#standardSQL
-- Median of an ascending-sorted array of any numeric element type:
-- the middle element when the length is odd, otherwise the mean of the
-- two middle elements.
CREATE TEMP FUNCTION Median(arr ANY TYPE) AS (
  CASE
    WHEN MOD(ARRAY_LENGTH(arr), 2) = 1
      THEN arr[OFFSET(DIV(ARRAY_LENGTH(arr), 2))]
    ELSE (
      arr[OFFSET(DIV(ARRAY_LENGTH(arr), 2) - 1)]
      + arr[OFFSET(DIV(ARRAY_LENGTH(arr), 2))]
    ) / 2
  END
);
-- Inline stand-in for the real table; the first row casts score to NUMERIC.
WITH `project.dataset.document` AS (
  SELECT
    'A' AS dId,
    CAST(100.0 AS NUMERIC) AS score,
    DATETIME '2019-03-08 16:17:34.043' AS datetime
  UNION ALL
  SELECT 'B', 80.5, '2019-02-15 16:17:34.043'
  UNION ALL
  SELECT 'C', 70.1, '2019-03-08 16:17:34.043'
)
-- This variant sorts by the raw score and aggregates the FLOAT64 cast.
-- The follow-up note in this answer attributes an internal error to
-- ordering by the NUMERIC value, which is what ORDER BY score does here.
SELECT Median(ARRAY_AGG(CAST(score AS FLOAT64) ORDER BY score)) AS Median
FROM `project.dataset.document`
WHERE DATE(datetime) >= DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY)
更新
好的,找到了内部错误的原因——这是由于按 NUMERIC 类型的值排序造成的
因此,最终版本是:
#standardSQL
-- Median of an array whose elements are already in ascending order.
-- Odd length: the element at the middle index.
-- Even length: the average of the two elements around the middle.
CREATE TEMP FUNCTION Median(arr ANY TYPE) AS (
  CASE MOD(ARRAY_LENGTH(arr), 2)
    WHEN 1 THEN arr[OFFSET(DIV(ARRAY_LENGTH(arr), 2))]
    ELSE (
      arr[OFFSET(DIV(ARRAY_LENGTH(arr), 2) - 1)]
      + arr[OFFSET(DIV(ARRAY_LENGTH(arr), 2))]
    ) / 2
  END
);
-- Sample rows from the question, standing in for the real table.
WITH `project.dataset.document` AS (
  SELECT
    'A' AS dId,
    CAST(100.0 AS NUMERIC) AS score,
    DATETIME '2019-03-08 16:17:34.043' AS datetime
  UNION ALL
  SELECT 'B', 80.5, '2019-02-15 16:17:34.043'
  UNION ALL
  SELECT 'C', 70.1, '2019-03-08 16:17:34.043'
)
-- The original score values are aggregated, but the sort key is the
-- FLOAT64 cast of score rather than the score itself.
SELECT Median(ARRAY_AGG(score ORDER BY CAST(score AS FLOAT64))) AS Median
FROM `project.dataset.document`
WHERE DATE(datetime) >= DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY)
答案 1 :(得分:0)
您可以使用 PERCENTILE_CONT 来实现。
只需用 WHERE 子句筛选出最近一个月的所有分数,
再对这些分数求 0.5 分位数(PERCENTILE_CONT)即可。
如果您想用一种最简单直接的方式,查询如下:
-- Median score over the last 30 days using the built-in percentile function.
-- PERCENTILE_CONT is used as an analytic (window) function here, so it emits
-- the same median once per input row; LIMIT 1 collapses that to a single
-- result row.
-- NOTE(review): the filter compares against a TIMESTAMP, which assumes
-- d.datetime is a TIMESTAMP column. If it is DATETIME, use
-- DATETIME_SUB(CURRENT_DATETIME(), INTERVAL 30 DAY) instead -- confirm
-- against the table schema.
SELECT
  PERCENTILE_CONT(score, 0.5) OVER () AS Median
FROM
  `document` AS d
WHERE
  d.datetime >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 30 DAY)
LIMIT 1