sql bigquery:从嵌套字段中提取“:”之后的字符串

时间:2019-07-27 15:54:12

标签: sql regex google-bigquery

我正在尝试从 Google BigQuery 的嵌套字段中提取“:”之后的所有值,并将它们放到单独的列中。但是,查询结果返回为空。

我认为问题在于所使用的正则表达式 r'^\:(.*?)'

-- Query as posted in the question, kept unchanged to show the problem.
-- Each comma-separated piece of GCAM is run through two regexes to build
-- two arrays: the text before ':' and the text after ':'.
SELECT
  ARRAY(
    -- Anchored at the start of x: lazily captures everything up to the first ':'.
    SELECT regexp_extract(x, r'^(.*?)\:')
    FROM UNNEST(split(GCAM,',')) AS x
    -- regexp_extract returns NULL when x has no match; those pieces are skipped.
    WHERE regexp_extract(x, r'^(.*?)\:') IS NOT NULL
  ) AS GCAM_field,
   ARRAY(
    -- '^\:' only matches a piece that BEGINS with ':', so a piece shaped like
    -- 'name:value' gives NULL and is filtered out below. Where it does match,
    -- nothing follows the lazy group (.*?), so it captures the empty string.
    SELECT regexp_extract(x, r'^\:(.*?)')
    FROM UNNEST(split(GCAM,',')) AS x
    WHERE regexp_extract(x, r'^\:(.*?)') IS NOT NULL
  ) AS GCAM_value
FROM `gdelt-bq.gdeltv2.gkg_partitioned`
-- Both bounds are the same timestamp, so this selects a single partition value.
WHERE _PARTITIONTIME BETWEEN TIMESTAMP('2019-02-02') AND TIMESTAMP('2019-02-02')

预期结果是:列“GCAM_field”包含每个“:”之前的值,列“GCAM_value”包含每个“:”之后的值。但是,后者返回为空。

2 个答案:

答案 0 :(得分:0)

#standardSQL
-- Splits every comma-separated piece of GCAM at its first ':' into two
-- parallel arrays:
--   GCAM_field: text before the first ':'
--   GCAM_value: text after the first ':'
-- Pieces without a ':' make REGEXP_EXTRACT return NULL and are dropped from
-- both arrays, so the arrays always have the same length.
SELECT
  ARRAY(
    SELECT REGEXP_EXTRACT(x, r'^(.*?):')
    FROM UNNEST(SPLIT(GCAM, ',')) AS x WITH OFFSET AS pos
    WHERE REGEXP_EXTRACT(x, r'^(.*?):') IS NOT NULL
    -- UNNEST does not guarantee element order on its own; ordering by the
    -- source offset keeps GCAM_field[i] paired with GCAM_value[i].
    ORDER BY pos
  ) AS GCAM_field,
  ARRAY(
    -- Unanchored on the left, so the match starts at the first ':' in x and
    -- the group runs to the end of the piece.
    SELECT REGEXP_EXTRACT(x, r':(.*?)$')
    FROM UNNEST(SPLIT(GCAM, ',')) AS x WITH OFFSET AS pos
    WHERE REGEXP_EXTRACT(x, r':(.*?)$') IS NOT NULL
    ORDER BY pos
  ) AS GCAM_value
FROM `gdelt-bq.gdeltv2.gkg_partitioned`
-- Single-partition filter; equivalent to BETWEEN with identical bounds.
WHERE _PARTITIONTIME = TIMESTAMP('2019-02-02')

或更简单:

#standardSQL
-- SPLIT-based variant: breaks every comma-separated piece of GCAM on ':' and
-- returns the first part as GCAM_field and the second part as GCAM_value.
-- Note: for a piece with more than one ':' (e.g. 'a:b:c') the value is only
-- the second part ('b'), not everything after the first ':'.
SELECT
  ARRAY(
    SELECT SPLIT(x, ':')[SAFE_OFFSET(0)]
    FROM UNNEST(SPLIT(GCAM, ',')) AS x WITH OFFSET AS pos
    -- SPLIT always returns at least one element, so element 0 is never NULL;
    -- filter on the presence of ':' instead so that pieces without a ':' are
    -- dropped from BOTH arrays and the two stay the same length.
    WHERE STRPOS(x, ':') > 0
    -- UNNEST does not guarantee element order on its own; ordering by the
    -- source offset keeps GCAM_field[i] paired with GCAM_value[i].
    ORDER BY pos
  ) AS GCAM_field,
  ARRAY(
    SELECT SPLIT(x, ':')[SAFE_OFFSET(1)]
    FROM UNNEST(SPLIT(GCAM, ',')) AS x WITH OFFSET AS pos
    WHERE STRPOS(x, ':') > 0
    ORDER BY pos
  ) AS GCAM_value
FROM `gdelt-bq.gdeltv2.gkg_partitioned`
-- Single-partition filter; equivalent to BETWEEN with identical bounds.
WHERE _PARTITIONTIME = TIMESTAMP('2019-02-02')

答案 1 :(得分:0)

解决此问题的另一种方法

with

-- Inline sample rows so the query can be run without any table.
-- The second row covers the edge cases: empty value ('qqq:'), empty field
-- (':777') and a value that itself contains ':' ('xxx:555:&&&').
sample_data as (
    select
        id,
        gcam
    from
        unnest(
            array[
                struct(1 as id, 'bbb:111,aaa:222' as gcam),
                struct(2 as id, 'qqq:,k:3,:777,xxx:555:&&&' as gcam)
            ]
        )
)

-- Wrapping gcam in commas gives every entry a leading and a trailing ',',
-- so each entry can be matched as ',field:' and ':value,'.
-- The character classes stop at the delimiters: a field cannot contain ','
-- or ':', and a value cannot contain ','. This keeps a match from running
-- across an entry that has no ':' (a lazy '.*?' would turn 'abc,def:1'
-- into the field 'abc,def').
select
    regexp_extract_all(
        concat(',', gcam, ','), r',([^,:]*):') as gcam_field,
    regexp_extract_all(
        concat(',', gcam, ','), r':([^,]*),') as gcam_value
from
    sample_data