我有一个带有JSON列的表。该JSON列可以包含任意文档。我想根据时间戳合并这些文档,时间戳在另一列中提供。是否可以按时间戳合并这些JSON文档?
这里是一个示例:
at t3 time {a:"1", b:"2"}
at t2 time {b:"4"}
at t1 time {a:"4", c:"5"}
我想创建{a:"1", b:"2", c:"5"}
作为输出。 BigQuery是否有可能做到这一点?
答案 0 :(得分:1)
也许有更好的方法,而我想到的第一个主意是:
#standardSQL
-- Merges the JSON documents of a table into a single document. When a key
-- appears in several documents, the value from the row with the greatest
-- timestamp wins.
--
-- merge_json(json_string):
--   json_string  JSON objects joined with the '||||||' delimiter, ordered
--                from oldest to newest.
--   Returns the merged object serialized as a JSON string with its keys in
--   sorted order, or NULL when json_string is NULL (for example STRING_AGG
--   over zero rows).
-- Limitations: the merge is shallow (top-level keys only), parts that are
-- not JSON objects are skipped, and a document whose text contains the
-- '||||||' delimiter is split incorrectly.
CREATE TEMPORARY FUNCTION merge_json(json_string STRING)
RETURNS STRING
LANGUAGE js
AS
"""
  if (json_string === null || json_string === undefined) {
    return null;
  }
  // Later parts overwrite earlier ones, so the newest document wins.
  var merged = {};
  json_string.split('||||||').forEach(function (part) {
    var doc = JSON.parse(part);
    if (doc === null || typeof doc !== 'object') {
      return;  // scalars and null carry no keys to merge
    }
    Object.keys(doc).forEach(function (key) {
      merged[key] = doc[key];
    });
  });
  // Copy the keys in sorted order so the output does not depend on the
  // order in which keys were first seen.
  var ordered = {};
  Object.keys(merged).sort().forEach(function (key) {
    ordered[key] = merged[key];
  });
  return JSON.stringify(ordered);
""";

WITH sample_data AS (
  -- Documents must be valid JSON (quoted keys) for JSON.parse to accept them.
  SELECT '{"a":"1", "b":"2"}' AS json, 3000 AS timestamp
  UNION ALL
  SELECT '{"b":"4"}' AS json, 2000 AS timestamp
  UNION ALL
  SELECT '{"a":"4", "c":"5"}' AS json, 1000 AS timestamp
)
-- Aggregate every row into one string, oldest first, so that merge_json sees
-- the newest document last. Rows sharing a timestamp are merged in an
-- unspecified relative order.
SELECT
  merge_json(STRING_AGG(json, '||||||' ORDER BY timestamp)) AS joined_json
FROM sample_data;
输出:
答案 1 :(得分:1)
以下是使用BigQuery标准SQL函数以及您的数据的可能解决方案:
#standardSQL
-- Merges the sample documents for a fixed set of keys (a, b, c): for each key
-- the value from the row with the highest t that actually has the key is kept.
WITH test AS (
  SELECT '{"a":"1", "b":"2"}' AS json, 3 AS t
  UNION ALL
  SELECT '{"b":"4"}' AS json, 2 AS t
  UNION ALL
  SELECT '{"a":"4", "c":"5"}' AS json, 1 AS t
),
-- One column per known key; a key missing from a document yields NULL.
extracted AS (
  SELECT
    JSON_EXTRACT_SCALAR(json, '$.a') AS a,
    JSON_EXTRACT_SCALAR(json, '$.b') AS b,
    JSON_EXTRACT_SCALAR(json, '$.c') AS c,
    t
  FROM test
),
-- ARRAY_AGG ... LIMIT 1 picks the newest non-NULL value as a one-element
-- array; ARRAY_TO_STRING turns that array back into a plain STRING.
latest_values AS (
  SELECT
    ARRAY_TO_STRING(ARRAY_AGG(a IGNORE NULLS ORDER BY t DESC LIMIT 1), '') AS a,
    ARRAY_TO_STRING(ARRAY_AGG(b IGNORE NULLS ORDER BY t DESC LIMIT 1), '') AS b,
    ARRAY_TO_STRING(ARRAY_AGG(c IGNORE NULLS ORDER BY t DESC LIMIT 1), '') AS c
  FROM extracted
)
-- data_row is the whole merged row as a STRUCT; json_row is its JSON text.
SELECT
  data_row,
  TO_JSON_STRING(data_row) AS json_row
FROM latest_values AS data_row
请注意,ARRAY_AGG 仅用于查找每个属性最新的非NULL值,因此其结果需要通过 ARRAY_TO_STRING 转换为 STRING。
该查询的结果是:
Row data_row.a data_row.b data_row.c json_row
1 1 2 5 {"a":"1","b":"2","c":"5"}
此查询的问题在于您必须事先列出所有属性(在这种情况下为 a、b、c)。
答案 2 :(得分:1)
以下是适用于BigQuery标准SQL的解决方案:
#standardSQL
-- Generic merge that does not need to know the key names in advance.
-- Each document is stripped of its braces and split on ',' (the SPLIT
-- default) into "key":"value" pairs; the text before the first ':' is the key.
WITH latest_pair_per_key AS (
  -- For every key keep the pair from the row with the highest t. This query
  -- does not specify which row is kept when several rows tie on t.
  SELECT
    STRING_AGG(TRIM(x) ORDER BY t DESC LIMIT 1) AS y
  FROM `project.dataset.table`
  CROSS JOIN UNNEST(SPLIT(REGEXP_REPLACE(json, r'{|}', ''))) AS x
  GROUP BY TRIM(SPLIT(x, ':')[OFFSET(0)])
)
-- Join the surviving pairs into one string, sorted by pair text.
SELECT
  STRING_AGG(y, ', ' ORDER BY y) AS json
FROM latest_pair_per_key
注意:上面的解决方案是通用的,不需要事先知道属性名称(例如 a、b 等),而是解析并提取实际出现的键值对。显然,它依赖于一个假设:JSON与示例中的一样简单(键和值中不包含逗号、冒号或花括号)。
您可以使用问题中的示例数据来测试和试验,如下例所示:
#standardSQL
-- Runs the generic merge against the sample rows from the question.
WITH `project.dataset.table` AS (
  SELECT '{"a":"1", "b":"2"}' AS json, 3 AS t
  UNION ALL
  SELECT '{"b":"4"}', 2
  UNION ALL
  SELECT '{"a":"4", "c":"5"}', 1
),
-- Strip the braces, split each document on ',' into "key":"value" pairs and,
-- per key (the text before the first ':'), keep the pair with the highest t.
latest_pair_per_key AS (
  SELECT
    STRING_AGG(TRIM(x) ORDER BY t DESC LIMIT 1) AS y
  FROM `project.dataset.table`
  CROSS JOIN UNNEST(SPLIT(REGEXP_REPLACE(json, r'{|}', ''))) AS x
  GROUP BY TRIM(SPLIT(x, ':')[OFFSET(0)])
)
-- Join the surviving pairs into one string, sorted by pair text.
SELECT
  STRING_AGG(y, ', ' ORDER BY y) AS json
FROM latest_pair_per_key
有结果
Row json
1 "a":"1", "b":"2", "c":"5"
由于(如上所述)该方案足够通用,您可以添加包含更多属性的行,而无需更改查询代码,如下所示:
#standardSQL
-- Runs the generic merge against an extended sample with extra keys.
WITH `project.dataset.table` AS (
  SELECT '{"a":"1", "b":"2"}' AS json, 3 AS t
  UNION ALL
  SELECT '{"b":"4"}', 2
  UNION ALL
  SELECT '{"a":"4", "c":"5"}', 1
  UNION ALL
  SELECT '{"abc":"1", "xyz":"2"}', 3
  UNION ALL
  -- The two "abc" rows carry different t values: with equal timestamps
  -- ORDER BY t DESC LIMIT 1 could keep either row, so the value reported
  -- for "abc" would not be deterministic.
  SELECT '{"abc":"3", "vwy":"4"}', 2
),
-- Strip the braces, split each document on ',' into "key":"value" pairs and,
-- per key (the text before the first ':'), keep the pair with the highest t.
latest_pair_per_key AS (
  SELECT
    STRING_AGG(TRIM(x) ORDER BY t DESC LIMIT 1) AS y
  FROM `project.dataset.table`
  CROSS JOIN UNNEST(SPLIT(REGEXP_REPLACE(json, r'{|}', ''))) AS x
  GROUP BY TRIM(SPLIT(x, ':')[OFFSET(0)])
)
-- Join the surviving pairs into one string, sorted by pair text.
SELECT
  STRING_AGG(y, ', ' ORDER BY y) AS json
FROM latest_pair_per_key
有结果
Row json
1 "a":"1", "abc":"1", "b":"2", "c":"5", "vwy":"4", "xyz":"2"