我有一个家庭关系数据库:
-- Sample relationship pairs: each row links relative_1 to relative_2.
-- Column names are taken from the first SELECT of the UNION ALL chain.
WITH example_data AS (
    SELECT 'abc' AS relative_1, 'def' AS relative_2
    UNION ALL SELECT 'abc', '123'
    UNION ALL SELECT 'def', '334'
    UNION ALL SELECT 'fdc', '123'
    UNION ALL SELECT 'fgl', '342'
)
如何基于这些数据创建完整的家族,以便获得以下结果:
我为生成所需输出而编写的代码似乎完全不实用:对于一个有10万行输入的表,我的查询在第5次自联接之后就达到了6小时的执行时间上限。我并不担心最终会把整张表连接成一个长串——我知道一个家族的成员数量是有限的。
此外,如果能将结果返回为嵌套表,其中 list_all_relatives 为重复字段,并同时给出 relative_1 与每位远亲之间的最少步数,那就更好了。
以下是我的低效代码,它返回图中所示的结果:
-- raw_data: the sample pairs plus list_0, the starting member list for each
-- row (seeded with relative_1 itself). step_0 reads this CTE by name, so the
-- WITH header must open the statement here.
with raw_data as (
SELECT 'abc' as relative_1, 'def' as relative_2, 'abc' as list_0
union all
SELECT 'abc' as relative_1, '123' as relative_2, 'abc' as list_0
union all
SELECT 'def' as relative_1, '334' as relative_2, 'def' as list_0
union all
SELECT 'fdc' as relative_1, '123' as relative_2, 'fdc' as list_0
union all
SELECT 'fgl' as relative_1, '342' as relative_2, 'fgl' as list_0
)
,
-- step_0: fold relative_1, relative_2 and list_0 of each pair into a single
-- sorted, de-duplicated, comma-separated string (combined_list).
step_0 AS (
    SELECT
        relative_1,
        relative_2,
        ARRAY_TO_STRING(
            ARRAY(
                SELECT DISTINCT relative_id
                FROM UNNEST(SPLIT(CONCAT(relative_1, ',', relative_2, ',', list_0), ',')) AS relative_id
                ORDER BY relative_id
            ),
            ','
        ) AS combined_list
    FROM raw_data
)
,
-- step_1: attach to every pair the combined_list of each step_0 row that has
-- the same relative_2 (exposed as list_1), then rebuild combined_list from
-- relative_1, relative_2 and list_1. The join yields one row per match.
step_1 AS (
    SELECT
        relative_1,
        relative_2,
        list_1,
        ARRAY_TO_STRING(
            ARRAY(
                SELECT DISTINCT relative_id
                FROM UNNEST(SPLIT(CONCAT(relative_1, ',', relative_2, ',', list_1), ',')) AS relative_id
                ORDER BY relative_id
            ),
            ','
        ) AS combined_list
    FROM step_0
    LEFT JOIN (
        SELECT
            relative_2,
            combined_list AS list_1
        FROM step_0
    ) AS same_relative_2
        USING (relative_2)
)
,
-- step_2: attach the combined_list of each step_1 row that has the same
-- relative_1 (exposed as list_2) and merge it into a new sorted,
-- de-duplicated combined_list. The previous combined_list and list_1 are
-- dropped from the output; DISTINCT collapses identical rows.
step_2 AS (
    SELECT DISTINCT
        * EXCEPT (combined_list, list_1),
        ARRAY_TO_STRING(
            ARRAY(
                SELECT DISTINCT relative_id
                FROM UNNEST(SPLIT(CONCAT(combined_list, ',', list_2), ',')) AS relative_id
                ORDER BY relative_id
            ),
            ','
        ) AS combined_list
    FROM step_1
    LEFT JOIN (
        SELECT
            relative_1,
            combined_list AS list_2
        FROM step_1
    ) AS same_relative_1
        USING (relative_1)
)
,
-- step_3: same expansion as the previous step, but matching on relative_2:
-- the combined_list of each step_2 row with the same relative_2 arrives as
-- list_3 and is merged into a new sorted, de-duplicated combined_list.
-- The previous combined_list and list_2 are dropped; DISTINCT removes repeats.
step_3 AS (
    SELECT DISTINCT
        * EXCEPT (combined_list, list_2),
        ARRAY_TO_STRING(
            ARRAY(
                SELECT DISTINCT relative_id
                FROM UNNEST(SPLIT(CONCAT(combined_list, ',', list_3), ',')) AS relative_id
                ORDER BY relative_id
            ),
            ','
        ) AS combined_list
    FROM step_2
    LEFT JOIN (
        SELECT
            relative_2,
            combined_list AS list_3
        FROM step_2
    ) AS same_relative_2
        USING (relative_2)
)
,
-- step_4: one more expansion, matching on relative_1: the combined_list of
-- each step_3 row with the same relative_1 arrives as list_4 and is merged
-- into a new sorted, de-duplicated combined_list. The previous combined_list
-- and list_3 are dropped; DISTINCT removes repeats.
step_4 AS (
    SELECT DISTINCT
        * EXCEPT (combined_list, list_3),
        ARRAY_TO_STRING(
            ARRAY(
                SELECT DISTINCT relative_id
                FROM UNNEST(SPLIT(CONCAT(combined_list, ',', list_4), ',')) AS relative_id
                ORDER BY relative_id
            ),
            ','
        ) AS combined_list
    FROM step_3
    LEFT JOIN (
        SELECT
            relative_1,
            combined_list AS list_4
        FROM step_3
    ) AS same_relative_1
        USING (relative_1)
),
-- step_N: pass-through of step_4 with every column unchanged.
-- NOTE(review): presumably a fixed name for "the last expansion step" so the
-- CTEs below need no edit when more steps are added - confirm with the author.
step_N AS (
    SELECT *
    FROM step_4
)
,
-- step_prefinal: distinct (relative_1, list_4, combined_list) rows, each with
-- the number of members in combined_list and the largest such count seen for
-- the same relative_1.
-- Members are counted by splitting on the comma separator, so an ID that
-- contains characters other than ASCII letters and digits is still counted
-- as one member.
step_prefinal AS (
    SELECT DISTINCT
        relative_1,
        list_4,
        combined_list,
        ARRAY_LENGTH(SPLIT(combined_list, ',')) AS n_elements_in_list,
        MAX(ARRAY_LENGTH(SPLIT(combined_list, ','))) OVER (PARTITION BY relative_1) AS longest_list_relatives
    FROM step_N
)
,
-- step_final: for each relative_1 keep only the lists whose member count
-- equals the longest count for that relative_1. After grouping, the window
-- functions report how many such lists remain per relative_1 and their size.
step_final AS (
    SELECT
        relative_1,
        combined_list,
        n_elements_in_list,
        COUNT(*) OVER (PARTITION BY relative_1) AS cnt_lists_per_relative,
        MAX(n_elements_in_list) OVER (PARTITION BY relative_1) AS max_elements
    FROM step_prefinal
    WHERE longest_list_relatives = n_elements_in_list
    GROUP BY
        relative_1,
        combined_list,
        n_elements_in_list
)
,
-- stats: diagnostic summary of step_final - for each value of
-- cnt_lists_per_relative, the number of distinct relative_1 values and the
-- largest list size. The final SELECT of this query does not read it.
stats AS (
    SELECT
        cnt_lists_per_relative,
        COUNT(DISTINCT relative_1) AS cnt,
        MAX(max_elements) AS max_elements
    FROM step_final
    GROUP BY cnt_lists_per_relative
    ORDER BY cnt_lists_per_relative
)
-- Result: one row per surviving list, exposing combined_list under the
-- requested name list_all_relatives.
SELECT
    relative_1,
    combined_list AS list_all_relatives
FROM step_final
WHERE TRUE
答案 0 :(得分:0)
我能够创建代码以更简单的方式重现您的输出。
我已使用提供的example_data重现它。我已经使用UDF和JavaScript实现了所需的输出。下面是代码:
# Custom JavaScript UDF: takes an array of strings and hands it back as a
# single STRING value. The body returns the array itself; the declared
# STRING return type is what turns it into text.
CREATE TEMP FUNCTION rel(relatives ARRAY<STRING>)
RETURNS STRING
LANGUAGE js
AS """
return relatives;
""";
# Provided sample data: one row per (relative_1, relative_2) pair.
WITH example_data AS (
  SELECT 'abc' AS relative_1, 'def' AS relative_2
  UNION ALL
  SELECT 'abc' AS relative_1, '123' AS relative_2
  UNION ALL
  SELECT 'def' AS relative_1, '334' AS relative_2
  UNION ALL
  SELECT 'fdc' AS relative_1, '123' AS relative_2
  UNION ALL
  SELECT 'fgl' AS relative_1, '342' AS relative_2
),
# Manipulating the data: one row per relative_1 (exposed as key) with all of
# its relative_2 values collected into the nested column relatives.
# Grouping example_data directly yields the same rows as joining it to its own
# list of distinct keys, without the extra join. The CAST keeps the array
# elements STRING, which is the type rel() accepts.
data AS (
  SELECT
    relative_1 AS key,
    ARRAY_AGG(CAST(relative_2 AS STRING)) AS relatives
  FROM example_data
  GROUP BY relative_1
)
# Selecting the desired fields and using the UDF: rel() converts each nested
# array into one string; the outer ORDER BY fixes the row order.
SELECT
  key,
  rel(relatives) AS list_of_relatives
FROM data
ORDER BY key
如您所见,第一步是创建一个 UDF,它接收一个字符串数组(嵌套字段),并针对每个“key”将其作为字符串列表返回。声明 example_data 之后,第二步需要进行一些数据处理,以得到如下的表:
如您所见,该表包含 relative_1(命名为 key)以及存放在嵌套列中的亲属(relatives)。
然后,选择所需的字段,即 key 以及对 relatives 调用第一步中编写的 UDF 所得到的 list_of_relatives,输出如下:
最后,请注意,list_of_relatives 不再是嵌套字段,而是一个 STRING,其中各个值以逗号分隔,如下所示: