Question

我正在查看BigQuery中的文本序列，并试图识别许多行上的单词补全（共享ID）。数据如下：

ID, Text
1, t
1, th
1, the
1, the
1, the c
1, the ca
1, the cat
1, the cat 
1, the cat s
...
1, the cat sat on the mat
2, r
...

对于每个给定的ID和序列，我正在尝试查找下一个单词边界。因此理想的输出为：

ID, Text, Boundary
1, t, the
1, th, the
1, the c, the cat
1, the ca, the cat
1, the cat s, the cat sat

在上面的数据中，共享同一ID且以空格结尾的下一行给出了下一个单词补全边界（这样的边界可以有多个）。

Answer 1

以下解答适用于BigQuery标准SQL。

注意：这是一种蛮力方法，因此查询可能不够优雅——希望它能给您一个良好的开端。

#standardSQL
-- For every partially typed text, report the boundary row (same id, text
-- ending in a space) that completes the word currently being typed.
WITH tagged AS (
  -- is_boundary: the text ends in a space.
  -- grp: number of spaces in the text, not counting one trailing space, so a
  -- partial text and the boundary that completes its last word share a grp.
  SELECT
    id,
    text,
    ENDS_WITH(text, ' ') AS is_boundary,
    LENGTH(text) - LENGTH(REPLACE(text, ' ', ''))
      - CASE WHEN ENDS_WITH(text, ' ') THEN 1 ELSE 0 END AS grp
  FROM `project.dataset.table`
),
grouped AS (
  -- One row per (id, grp): the boundary text plus the partial texts,
  -- shortest first.
  SELECT
    id,
    grp,
    STRING_AGG(CASE WHEN is_boundary THEN text ELSE '' END, '') AS boundary,
    ARRAY_AGG(
      CASE WHEN NOT is_boundary THEN text END
      IGNORE NULLS
      ORDER BY LENGTH(text)
    ) AS items
  FROM tagged
  GROUP BY id, grp
)
SELECT
  id,
  item,
  boundary
FROM grouped
CROSS JOIN UNNEST(grouped.items) AS item WITH OFFSET AS pos
-- Drop the row where the word is already complete (differs only by the space).
WHERE RTRIM(item) != RTRIM(boundary)

如果将其应用于问题中的示例数据，查询如下：

#standardSQL
-- Demo of the boundary query on inline sample data standing in for the table.
WITH `project.dataset.table` AS (
  SELECT 1 id, 't' text UNION ALL
  SELECT 1, 'th' UNION ALL
  SELECT 1, 'the' UNION ALL
  SELECT 1, 'the ' UNION ALL
  SELECT 1, 'the c' UNION ALL
  SELECT 1, 'the ca' UNION ALL
  SELECT 1, 'the cat' UNION ALL
  SELECT 1, 'the cat ' UNION ALL
  SELECT 1, 'the cat s' UNION ALL
  SELECT 1, 'the cat sat ' 
),
tagged AS (
  -- is_boundary: the text ends in a space.
  -- grp: number of spaces in the text, not counting one trailing space, so a
  -- partial text and the boundary that completes its last word share a grp.
  SELECT
    id,
    text,
    ENDS_WITH(text, ' ') AS is_boundary,
    LENGTH(text) - LENGTH(REPLACE(text, ' ', ''))
      - CASE WHEN ENDS_WITH(text, ' ') THEN 1 ELSE 0 END AS grp
  FROM `project.dataset.table`
),
grouped AS (
  -- One row per (id, grp): the boundary text plus the partial texts,
  -- shortest first.
  SELECT
    id,
    grp,
    STRING_AGG(CASE WHEN is_boundary THEN text ELSE '' END, '') AS boundary,
    ARRAY_AGG(
      CASE WHEN NOT is_boundary THEN text END
      IGNORE NULLS
      ORDER BY LENGTH(text)
    ) AS items
  FROM tagged
  GROUP BY id, grp
)
SELECT
  id,
  item,
  boundary
FROM grouped
CROSS JOIN UNNEST(grouped.items) AS item WITH OFFSET AS pos
-- Drop the row where the word is already complete (differs only by the space).
WHERE RTRIM(item) != RTRIM(boundary)
ORDER BY id, grp, pos

结果是

Row     id      item        boundary     
1       1       t           the  
2       1       th          the  
3       1       the c       the cat  
4       1       the ca      the cat  
5       1       the cat s   the cat sat

Answer 2

BigQuery UDF在这些情况下派上用场。这是一个可行的解决方案：

#standardSQL
/*
  boundaryf(text, sentence)

  Returns the leading words of `sentence` up to and including the word that
  `text` is currently typing.

  text     - a partially typed prefix, e.g. 'the c' or 'the cat '.
  sentence - the text to take the completed words from, e.g. 'the cat sat'.

  Words are separated by a single space. The result is the first N words of
  `sentence` joined by ' ', where N is the number of space-separated pieces in
  `text` minus its number of trailing spaces.
*/
CREATE TEMP FUNCTION boundaryf (text STRING, sentence STRING) AS (
  ARRAY_TO_STRING(ARRAY(
    SELECT w
    -- WITH OFFSET exposes each word's position so the original word order can
    -- be filtered on and restored explicitly; UNNEST and ARRAY(subquery) do
    -- not guarantee element order without an ORDER BY.
    FROM UNNEST(SPLIT(sentence, ' ')) AS w WITH OFFSET AS pos
    -- Respect the ending space: each trailing space adds an empty piece to
    -- SPLIT(text, ' '), so subtract the trailing-space count. Only ' ' is
    -- trimmed, matching the delimiter used by SPLIT.
    WHERE pos < ARRAY_LENGTH(SPLIT(text, ' ')) - (LENGTH(text) - LENGTH(RTRIM(text, ' ')))
    ORDER BY pos
  ), ' ')
);

WITH items AS (
  -- Sample data; replace with your own table. Row order does not matter.
  SELECT 1 AS id, 't' AS text UNION ALL
  SELECT 1, 'th' UNION ALL
  SELECT 1, 'the' UNION ALL
  SELECT 1, 'the ' UNION ALL
  SELECT 1, 'the c' UNION ALL
  SELECT 1, 'the ca' UNION ALL
  SELECT 1, 'the cat' UNION ALL
  SELECT 1, 'the cat ' UNION ALL
  SELECT 1, 'the cat s' UNION ALL
  SELECT 1, 'the cat sa' UNION ALL
  SELECT 1, 'the cat sat' UNION ALL
  SELECT 1, 'the cat sat ' UNION ALL
  SELECT 1, 'the cat sat o' UNION ALL
  SELECT 1, 'the cat sat on' UNION ALL
  SELECT 1, 'the cat sat on ' UNION ALL
  SELECT 1, 'the cat sat on a' UNION ALL
  SELECT 1, 'the cat sat on a ' UNION ALL
  SELECT 1, 'the cat sat on a m' UNION ALL
  SELECT 1, 'the cat sat on a ma' UNION ALL
  SELECT 1, 'the cat sat on a mat' UNION ALL
  SELECT 2, 'i' UNION ALL
  SELECT 2, 'i a' UNION ALL
  SELECT 2, 'i am' UNION ALL
  SELECT 2, 'i am f' UNION ALL
  SELECT 2, 'i am fr' UNION ALL
  SELECT 2, 'i am fre' UNION ALL
  SELECT 2, 'i am free'
),
sentences AS (
  -- The full sentence for an id is taken to be its longest text. Picking it
  -- by length avoids relying on the element order of an unordered ARRAY_AGG,
  -- which is not guaranteed.
  SELECT
    id,
    ARRAY_AGG(text IGNORE NULLS ORDER BY LENGTH(text) DESC LIMIT 1)[OFFSET(0)] AS sentence
  FROM items
  GROUP BY id
),
control AS (
  -- Pair every partial text with the word boundary taken from its sentence.
  SELECT
    i.id,
    i.text,
    boundaryf(i.text, s.sentence) AS boundary
  FROM items AS i
  LEFT JOIN sentences AS s
    ON s.id = i.id
)
SELECT
  id,
  text,
  boundary
FROM control
-- Deterministic output: shortest prefix first within each id.
ORDER BY id, LENGTH(text)

bigquery：查找以下行匹配条件

2 个答案: