以高效的方式从数组元素中获取所有可能的组合

时间:2018-07-24 09:30:43

标签: sql postgresql combinations postgresql-9.6

我有一个最多包含5个元素的数组,例如:{1,2,3,4,5}。我需要从该数组中获取所有可能的唯一组合,预期结果是:

 {1}
 {1,2}
 {1,2,3}
 {1,2,3,4}
 {1,2,3,4,5}
 {1,2,3,5}
 {1,2,4}
 {1,2,4,5}
 {1,2,5}
 {1,3}
 {1,3,4}
 {1,3,4,5}
 {1,3,5}
 {1,4}
 {1,4,5}
 {1,5}
 {2}
 {2,3}
 {2,3,4}
 {2,3,4,5}
 {2,3,5}
 {2,4}
 {2,4,5}
 {2,5}
 {3}
 {3,4}
 {3,4,5}
 {3,5}
 {4}
 {4,5}
 {5}

我有此解决方案:

-- Target table: one row per generated combination of card ids.
CREATE TABLE temp_all_possible_cards (
    card_ids integer[]
);

-- Inserts every non-empty, sorted, duplicate-free combination of the values
-- in cards_in_hands into temp_all_possible_cards.
--
-- Approach (deliberately unchanged): build all 5-tuples over the input with a
-- five-way cross join, reduce each tuple to its sorted set of distinct values,
-- and keep each resulting set once. Because exactly five positions are joined,
-- the intermediate result has n^5 rows for an n-element input, and sets with
-- more than five distinct values are never produced.
--
-- NOTE(review): sort(int[]) and uniq(int[]) are not core functions; presumably
-- they come from the intarray extension -- confirm it is installed.
--
-- Parameters:
--   cards_in_hands  array whose element combinations are generated
-- Returns: nothing; the combinations are written to temp_all_possible_cards.
CREATE OR REPLACE FUNCTION test(cards_in_hands INT[])
RETURNS void
AS $$
BEGIN
    WITH all_possible_cards (ids) AS (
        -- One array per 5-tuple; repeated values are collapsed by uniq() below.
        SELECT ARRAY[t1.card_id, t2.card_id, t3.card_id, t4.card_id, t5.card_id]
        FROM unnest(cards_in_hands) AS t1 (card_id)
        CROSS JOIN unnest(cards_in_hands) AS t2 (card_id)
        CROSS JOIN unnest(cards_in_hands) AS t3 (card_id)
        CROSS JOIN unnest(cards_in_hands) AS t4 (card_id)
        CROSS JOIN unnest(cards_in_hands) AS t5 (card_id)
    )
    INSERT INTO temp_all_possible_cards (card_ids)
    SELECT DISTINCT uniq(sort(ids))
    FROM all_possible_cards;
END;
$$ LANGUAGE plpgsql;

这是可行的,但是有一个大问题:有时我需要运行此函数5000次

-- Benchmark driver: calls test() 5000 times with the same five-element array.
DO $$
BEGIN
    FOR call_no IN 1..5000 LOOP
        -- PERFORM discards the (void) result of the call.
        PERFORM test('{1,2,3,4,5}');
    END LOOP;
END;
$$ LANGUAGE plpgsql;

该循环的执行时间为55-60秒。

问题:如何以高效的方式从数组中获得所有可能的唯一组合?如何优化该解决方案,使得即使调用5000次也比60秒快得多?

3 个答案:

答案 0 :(得分:1)

-- T-SQL: enumerate the bitmasks 1..31 with a recursive CTE. Bit k of a mask
-- decides whether the value k+1 is printed, so each non-empty subset of
-- {1,2,3,4,5} is rendered exactly once as text such as '{1, 3, 5}'.
;WITH masks AS (
    SELECT 1 AS mask
    UNION ALL
    SELECT mask + 1
    FROM masks
    WHERE mask < 2 * 2 * 2 * 2 * 2 - 1  -- 2^5 - 1 = 31
)
-- RTRIM leaves a trailing comma; LEFT drops it before the brace is closed.
SELECT LEFT(rendered.txt, LEN(rendered.txt) - 1) + '}'
FROM (
    SELECT RTRIM(
               '{'
               + CASE WHEN mask & 1  <> 0 THEN '1, ' ELSE '' END
               + CASE WHEN mask & 2  <> 0 THEN '2, ' ELSE '' END
               + CASE WHEN mask & 4  <> 0 THEN '3, ' ELSE '' END
               + CASE WHEN mask & 8  <> 0 THEN '4, ' ELSE '' END
               + CASE WHEN mask & 16 <> 0 THEN '5, ' ELSE '' END
           ) AS txt
    FROM masks
) AS rendered;

to show a benchmark

-- T-SQL benchmark: run the subset-rendering query 5000 times, storing each
-- run's rows in a temp table, then report the elapsed wall-clock time.
CREATE TABLE #test (x NVARCHAR(50));

DECLARE @run INT = 0;
DECLARE @started DATETIME2 = SYSUTCDATETIME();

WHILE @run < 5000
BEGIN
    -- Same bitmask enumeration as the stand-alone query: masks 1..31.
    ;WITH masks AS (
        SELECT 1 AS mask
        UNION ALL
        SELECT mask + 1
        FROM masks
        WHERE mask < 2 * 2 * 2 * 2 * 2 - 1
    )
    INSERT INTO #test (x)
    SELECT LEFT(rendered.txt, LEN(rendered.txt) - 1) + '}'
    FROM (
        SELECT RTRIM(
                   '{'
                   + CASE WHEN mask & 1  <> 0 THEN '1, ' ELSE '' END
                   + CASE WHEN mask & 2  <> 0 THEN '2, ' ELSE '' END
                   + CASE WHEN mask & 4  <> 0 THEN '3, ' ELSE '' END
                   + CASE WHEN mask & 8  <> 0 THEN '4, ' ELSE '' END
                   + CASE WHEN mask & 16 <> 0 THEN '5, ' ELSE '' END
               ) AS txt
        FROM masks
    ) AS rendered;

    SET @run = @run + 1;
END

-- Elapsed time in microseconds between the start stamp and now.
DECLARE @elapsed_us BIGINT = DATEDIFF(MICROSECOND, @started, SYSUTCDATETIME());

-- Report both the raw microsecond count and the value in seconds.
SELECT CAST(@elapsed_us AS NVARCHAR(19)) + 'us = '
     + CAST(CAST(@elapsed_us / 1000000.000000000000 AS DEC(10,3)) AS NVARCHAR(20))
     + ' seconds';

DROP TABLE #test;

I got 2.5 seconds

答案 1 :(得分:1)

在plpython函数中使用itertools:

-- Inserts every non-empty combination of the elements of cards_in_hand into
-- temp_all_possible_cards, one row per combination, using itertools.
--
-- Parameters:
--   cards_in_hand  array whose element combinations are generated;
--                  a NULL argument inserts nothing
-- Returns: nothing; the combinations are written to temp_all_possible_cards.
create or replace function generate_combinations(cards_in_hand int[])
returns void language plpython3u as $$

import itertools

# A NULL argument arrives as None; there is nothing to combine.
if cards_in_hand is None:
    return

# Prepare the INSERT once and keep it in SD (the function's private dictionary
# that persists between calls), so repeated calls do not re-prepare the plan.
plan = SD.get("insert_plan")
if plan is None:
    plan = plpy.prepare(
        "insert into temp_all_possible_cards (card_ids) values ($1)",
        ["int[]"])
    SD["insert_plan"] = plan

for size in range(1, len(cards_in_hand) + 1):
    for combo in itertools.combinations(cards_in_hand, size):
        # combinations() yields tuples; pass a list for the int[] parameter.
        plpy.execute(plan, [list(combo)])
$$;

这应该比plpgsql函数快几倍。

答案 2 :(得分:0)

此解决方案需要大约1.5秒才能处理5000个整数数组(我的测试数据表包含5000个不同的数组,长度在1到5之间,每个元素值在1到150之间)

这个想法是生成从1到2^length(array)-1的所有二进制值。然后,我只需要从数组中取出二进制字符串对应位置为1的那些元素即可。

A获取测试数据

B生成二进制值。因为bit(5)强制转换的长度是固定的,所以我需要使用substring函数将位串长度规范化为数组长度。

C将二进制值的各个位与对应的数组元素并排展开(unnest)。现在,我可以过滤掉组合结果中不应该包含的数组元素。

D重新聚合过滤的元素。

-- For every array in testdata.arrays, emit one row per non-zero bitmask,
-- containing the elements whose bit is set. The markers A-D match the steps
-- described in the accompanying text.
-- NOTE(review): the bit(5) cast presumably limits this to arrays of at most
-- five elements -- confirm against the data.
WITH source_arrays AS (                                        -- A
    SELECT arrays::int[] AS a
    FROM testdata.arrays
    ORDER BY a
),
measured AS (
    SELECT
        src.a,
        array_length(src.a, 1) AS len_a
    FROM source_arrays AS src
),
masks AS (                                                     -- B
    -- Masks 1 .. 2^len_a - 1 as bit strings, cut down to the last len_a digits.
    SELECT
        substring(
            generate_series(1, (2 ^ len_a - 1)::integer)::bit(5)::text,
            5 - len_a + 1,
            5
        ) AS no,
        m.a AS orig
    FROM measured AS m
),
paired AS (                                                    -- C
    -- Both unnest() calls expand in step: one row per (bit, element) pair.
    SELECT
        no,
        orig,
        unnest(regexp_split_to_array(no, ''))::boolean AS to_set,
        unnest(orig) AS single_char
    FROM masks
),
chosen AS (
    SELECT *
    FROM paired
    WHERE to_set
      AND single_char IS NOT NULL
)
SELECT
    orig,
    array_agg(single_char)                                     -- D
FROM chosen
GROUP BY orig, no;

示例:

表测试数据包含

{1}
{1,1,118}
{1,2,70,142,11}

查询结果为

orig                  array_agg             
--------------------  --------------------  
{1}                   {1}                   
{1,1,118}             {118}                 
{1,1,118}             {1}                   
{1,1,118}             {1,118}               
{1,1,118}             {1}                   
{1,1,118}             {118,1}               
{1,1,118}             {1,1}                 
{1,1,118}             {1,1,118}             
{1,2,70,142,11}       {11}                  
{1,2,70,142,11}       {142}                 
{1,2,70,142,11}       {142,11}              
{1,2,70,142,11}       {70}                  
{1,2,70,142,11}       {70,11}               
{1,2,70,142,11}       {70,142}              
{1,2,70,142,11}       {70,142,11}           
{1,2,70,142,11}       {2}                   
{1,2,70,142,11}       {2,11}                
{1,2,70,142,11}       {2,142}               
{1,2,70,142,11}       {142,11,2}            
{1,2,70,142,11}       {2,70}                
{1,2,70,142,11}       {2,70,11}             
{1,2,70,142,11}       {2,70,142}            
{1,2,70,142,11}       {2,70,142,11}         
{1,2,70,142,11}       {1}                   
{1,2,70,142,11}       {1,11}                
{1,2,70,142,11}       {1,142}               
{1,2,70,142,11}       {1,142,11}            
{1,2,70,142,11}       {1,70}                
{1,2,70,142,11}       {1,70,11}             
{1,2,70,142,11}       {1,70,142}            
{1,2,70,142,11}       {142,11,1,70}         
{1,2,70,142,11}       {1,2}                 
{1,2,70,142,11}       {1,2,11}              
{1,2,70,142,11}       {1,2,142}             
{1,2,70,142,11}       {1,2,142,11}          
{1,2,70,142,11}       {1,2,70}              
{1,2,70,142,11}       {1,2,70,11}           
{1,2,70,142,11}       {1,2,70,142}          
{1,2,70,142,11}       {1,2,70,142,11}

编辑:由于默认情况下array_agg不保证任何顺序,因此结果数组包含正确的元素,但不一定按原始顺序。

这也可能成为一个问题。因此,我在C和D之间添加了row_number()窗口函数作为位置指示器:

SELECT *, row_number() OVER (partition by orig, no) as position

现在,我可以通过执行array_agg(single_char ORDER BY position)来对数组进行排序。但是排序的代价并不低,还要多花大约一秒钟。