我正在尝试从 Google BigQuery 的嵌套字段中提取“:”之后的所有值,并将它们放入单独的一列中。但是,得到的结果为空。
我认为问题出在所使用的正则表达式 r'^\:(.*?)' 上。
-- Query from the question, kept as posted: it splits GCAM on ',' and tries to
-- return the text before ':' (GCAM_field) and after ':' (GCAM_value) of each
-- entry as two arrays. GCAM_field works; GCAM_value comes back empty.
SELECT
ARRAY(
-- '^(.*?)\:' lazily captures everything from the start of the entry up to the
-- first ':'; entries without a ':' give NULL and are removed by the WHERE.
SELECT regexp_extract(x, r'^(.*?)\:')
FROM UNNEST(split(GCAM,',')) AS x
WHERE regexp_extract(x, r'^(.*?)\:') IS NOT NULL
) AS GCAM_field,
ARRAY(
-- Defect: '^\:' requires the ':' to be the very first character of the entry,
-- so any entry that starts with other text does not match, yields NULL and is
-- filtered out below. Even when an entry does start with ':', the lazy '(.*?)'
-- has nothing after it to force it to grow, so it captures an empty string.
SELECT regexp_extract(x, r'^\:(.*?)')
FROM UNNEST(split(GCAM,',')) AS x
WHERE regexp_extract(x, r'^\:(.*?)') IS NOT NULL
) AS GCAM_value
FROM `gdelt-bq.gdeltv2.gkg_partitioned`
-- Both bounds are the same timestamp, so only that partition time is selected.
WHERE _PARTITIONTIME BETWEEN TIMESTAMP('2019-02-02') AND TIMESTAMP('2019-02-02')
预期结果是:列“GCAM_field”包含每一项中“:”之前的值,列“GCAM_value”包含每一项中“:”之后的值。但是,后者返回为空。
答案 0 :(得分:0)
#standardSQL
-- Splits each row's comma-separated GCAM string into entries and returns, as
-- two parallel arrays, the text before and after the first ':' of every entry.
-- REGEXP_EXTRACT yields NULL for an entry that contains no ':', and both
-- subqueries drop those entries, so GCAM_field[i] and GCAM_value[i] always
-- come from the same entry.
SELECT
  ARRAY(
    -- Anchored at the start, lazy capture: everything before the first ':'.
    SELECT REGEXP_EXTRACT(x, r'^(.*?):')
    FROM UNNEST(SPLIT(GCAM, ',')) AS x WITH OFFSET AS pos
    WHERE REGEXP_EXTRACT(x, r'^(.*?):') IS NOT NULL
    -- UNNEST does not guarantee element order; sort by the original position
    -- so that both arrays are emitted in the same, deterministic order.
    ORDER BY pos
  ) AS GCAM_field,
  ARRAY(
    -- Anchored at the end: everything after the first ':' of the entry.
    SELECT REGEXP_EXTRACT(x, r':(.*?)$')
    FROM UNNEST(SPLIT(GCAM, ',')) AS x WITH OFFSET AS pos
    WHERE REGEXP_EXTRACT(x, r':(.*?)$') IS NOT NULL
    ORDER BY pos
  ) AS GCAM_value
FROM `gdelt-bq.gdeltv2.gkg_partitioned`
-- Only rows whose partition time equals 2019-02-02.
WHERE _PARTITIONTIME = TIMESTAMP('2019-02-02')
或更简单:
#standardSQL
-- SPLIT-based variant: splits each row's comma-separated GCAM string into
-- entries and returns the part before the first ':' (GCAM_field) and the part
-- right after it (GCAM_value) as two parallel arrays.
SELECT
  ARRAY(
    SELECT SPLIT(x, ':')[SAFE_OFFSET(0)]
    FROM UNNEST(SPLIT(GCAM, ',')) AS x WITH OFFSET AS pos
    -- SPLIT always returns at least one element for a non-NULL string, so
    -- element 0 is never NULL and cannot be used to detect a missing ':'.
    -- Filter on the presence of ':' instead, exactly like the value subquery,
    -- so both arrays contain the same entries.
    WHERE STRPOS(x, ':') > 0
    -- UNNEST does not guarantee element order; sort by the original position.
    ORDER BY pos
  ) AS GCAM_field,
  ARRAY(
    -- NOTE: this is only the text between the first and the second ':'.
    -- If a value may itself contain ':', use a regular expression instead.
    SELECT SPLIT(x, ':')[SAFE_OFFSET(1)]
    FROM UNNEST(SPLIT(GCAM, ',')) AS x WITH OFFSET AS pos
    WHERE STRPOS(x, ':') > 0
    ORDER BY pos
  ) AS GCAM_value
FROM `gdelt-bq.gdeltv2.gkg_partitioned`
-- Only rows whose partition time equals 2019-02-02.
WHERE _PARTITIONTIME = TIMESTAMP('2019-02-02')
答案 1 :(得分:0)
解决此问题的另一种方法
with
  -- Inline sample rows. The second row exercises an empty value ('qqq:'),
  -- an empty key (':777') and a value that itself contains ':' ('555:&&&').
  sample_data as (
    select
      id,
      gcam
    from
      unnest(
        array[
          struct(1 as id, 'bbb:111,aaa:222' as gcam),
          struct(2 as id, 'qqq:,k:3,:777,xxx:555:&&&' as gcam)
        ]
      )
  )
-- The string is wrapped in commas so that every entry, including the first
-- and the last, is delimited by ',' on both sides.
select
  -- Key: text between a ',' and the first ':' that follows it. The class
  -- excludes ',' so a match can never run across an entry that has no ':'
  -- (with a lazy '.*?' the input 'abc,def:1' would yield the key 'abc,def').
  regexp_extract_all(
    concat(',', gcam, ','), r',([^,:]*):') as gcam_field,
  -- Value: text between the first ':' of an entry and the ',' that ends it;
  -- further ':' characters inside the value are kept.
  regexp_extract_all(
    concat(',', gcam, ','), r':([^,]*),') as gcam_value
from
  sample_data