我正在处理BigQuery中的文本序列,想要在共享同一ID的多行数据中识别单词补全。数据如下:
ID, Text
1, t
1, th
1, the
1, the
1, the c
1, the ca
1, the cat
1, the cat
1, the cat s
...
1, the cat sat on the mat
2, r
...
对于每个给定的ID和序列,我正在尝试查找下一个单词边界。因此理想的输出为:
ID, Text, Boundary
1, t, the
1, th, the
1, the c, the cat
1, the ca, the cat
1, the cat s, the cat sat
对于上面的每一行,共享同一ID且以空格结尾的下一行即给出下一个单词补全边界(这样的边界可以有多个)。注意:示例数据中看似重复的行(例如第二个“the”)实际上以空格结尾。
答案 0 :(得分:3)
以下查询适用于BigQuery标准SQL。
注意:这是一种蛮力方法,因此查询可能不够优雅——希望它能为您提供一个良好的起点。
#standardSQL
-- For every partial text of an id, report the boundary text of the word being
-- typed: the row of the same id and word group that ends with a space.
-- Output columns: id, item (the partial text), boundary (the space-terminated
-- text, trailing space included).
WITH flagged AS (
  -- grp is the number of spaces in the text, not counting one trailing space,
  -- so a partial word and the space-terminated row that completes it share
  -- the same grp. is_boundary marks the rows that end with a space.
  SELECT
    id,
    text,
    LENGTH(text) - LENGTH(REPLACE(text, ' ', ''))
      - IF(SUBSTR(text, -1) = ' ', 1, 0) AS grp,
    SUBSTR(text, -1) = ' ' AS is_boundary
  FROM `project.dataset.table`
),
grouped AS (
  -- One row per (id, grp). MAX picks a single boundary text even when the
  -- space-terminated row occurs more than once (concatenating them would
  -- yield e.g. 'the the '). COALESCE keeps '' for groups with no boundary row.
  SELECT
    id,
    grp,
    COALESCE(MAX(IF(is_boundary, text, NULL)), '') AS boundary,
    ARRAY_AGG(IF(NOT is_boundary, text, NULL) IGNORE NULLS ORDER BY LENGTH(text)) AS items
  FROM flagged
  GROUP BY id, grp
)
SELECT g.id, item, g.boundary
FROM grouped AS g
CROSS JOIN UNNEST(g.items) AS item
-- Drop the row where the word is already complete (e.g. 'the' vs 'the ').
WHERE RTRIM(item) != RTRIM(g.boundary)
您可以使用问题中的示例数据来测试上述查询,如下所示:
#standardSQL
-- Runs the word-boundary lookup against inline sample rows.
-- Output columns: id, item (the partial text), boundary (the space-terminated
-- text of the same word group, trailing space included).
WITH `project.dataset.table` AS (
  SELECT 1 id, 't' text UNION ALL
  SELECT 1, 'th' UNION ALL
  SELECT 1, 'the' UNION ALL
  SELECT 1, 'the ' UNION ALL
  SELECT 1, 'the c' UNION ALL
  SELECT 1, 'the ca' UNION ALL
  SELECT 1, 'the cat' UNION ALL
  SELECT 1, 'the cat ' UNION ALL
  SELECT 1, 'the cat s' UNION ALL
  SELECT 1, 'the cat sat '
),
flagged AS (
  -- grp is the number of spaces in the text, not counting one trailing space,
  -- so a partial word and the space-terminated row that completes it share
  -- the same grp. is_boundary marks the rows that end with a space.
  SELECT
    id,
    text,
    LENGTH(text) - LENGTH(REPLACE(text, ' ', ''))
      - IF(SUBSTR(text, -1) = ' ', 1, 0) AS grp,
    SUBSTR(text, -1) = ' ' AS is_boundary
  FROM `project.dataset.table`
),
grouped AS (
  -- One row per (id, grp). MAX picks a single boundary text even when the
  -- space-terminated row occurs more than once (concatenating them would
  -- yield e.g. 'the the '). COALESCE keeps '' for groups with no boundary row.
  SELECT
    id,
    grp,
    COALESCE(MAX(IF(is_boundary, text, NULL)), '') AS boundary,
    ARRAY_AGG(IF(NOT is_boundary, text, NULL) IGNORE NULLS ORDER BY LENGTH(text)) AS items
  FROM flagged
  GROUP BY id, grp
)
SELECT g.id, item, g.boundary
FROM grouped AS g
CROSS JOIN UNNEST(g.items) AS item WITH OFFSET AS pos
-- Drop the row where the word is already complete (e.g. 'the' vs 'the ').
WHERE RTRIM(item) != RTRIM(g.boundary)
-- pos follows the ARRAY_AGG ordering, i.e. shortest partial text first.
ORDER BY g.id, g.grp, pos
结果是
Row id item boundary
1 1 t the
2 1 th the
3 1 the c the cat
4 1 the ca the cat
5 1 the cat s the cat sat
答案 1 :(得分:0)
BigQuery UDF在这些情况下派上用场。这是一个可行的解决方案:
#standardSQL
/*
 * boundaryf(text, sentence)
 *
 * Returns the leading words of `sentence` up to and including the word that
 * `text` is currently typing, joined by single spaces.
 *   text:     the partial text typed so far.
 *   sentence: the complete text the partial text belongs to.
 * Example: boundaryf('the c', 'the cat sat') returns 'the cat'.
 */
create temp function boundaryf (text string, sentence string) as (
  array_to_string(array(
    select w
    -- WITH OFFSET exposes each word's position in the split array, so both
    -- the filter and the ordering are deterministic.
    from unnest(split(sentence, ' ')) as w with offset as pos
    -- respect the ending space: split() yields one extra empty element per
    -- trailing space, so subtract the number of trailing whitespace characters.
    where pos < array_length(split(text, ' ')) - (length(text) - length(rtrim(text)))
    -- ARRAY(subquery) only preserves element order when the subquery is ordered.
    order by pos
  ), ' ')
);
-- For every partial text, compute its word boundary with boundaryf() against
-- the complete sentence of the same id.
-- Output columns: id, text, boundary.
with items as (
  -- sample data; replace with your own table of (id, text) rows
  select 1 as id, 't' as text union all
  select 1, 'th' union all
  select 1, 'the' union all
  select 1, 'the ' union all
  select 1, 'the c' union all
  select 1, 'the ca' union all
  select 1, 'the cat' union all
  select 1, 'the cat ' union all
  select 1, 'the cat s' union all
  select 1, 'the cat sa' union all
  select 1, 'the cat sat' union all
  select 1, 'the cat sat ' union all
  select 1, 'the cat sat o' union all
  select 1, 'the cat sat on' union all
  select 1, 'the cat sat on ' union all
  select 1, 'the cat sat on a' union all
  select 1, 'the cat sat on a ' union all
  select 1, 'the cat sat on a m' union all
  select 1, 'the cat sat on a ma' union all
  select 1, 'the cat sat on a mat' union all
  select 2, 'i' union all
  select 2, 'i a' union all
  select 2, 'i am' union all
  select 2, 'i am f' union all
  select 2, 'i am fr' union all
  select 2, 'i am fre' union all
  select 2, 'i am free'
),
sentences as (
  -- The complete sentence of an id is taken to be its longest text. An
  -- un-ordered array_agg does not guarantee element order, so "the last
  -- element" is not reliable; ordering by length (then text, to break ties)
  -- makes the pick deterministic.
  select
    id,
    array_agg(text ignore nulls order by length(text) desc, text desc limit 1)[offset(0)] as sentence
  from items
  group by id
),
control as (
  select i.id, i.text, boundaryf(i.text, s.sentence) as boundary
  from items as i
  left join sentences as s on s.id = i.id
)
select id, text, boundary
from control
-- shortest partial text first within each id, for a stable, readable result
order by id, length(text), text