使用SQL实现近似不相交集的最佳方法是什么?
详情
我有一个边缘表,存储为[vertex_a, vertex_b]
的两列表。
我需要一个不同集合的表,存储为[vertex, set_id]
,每个顶点有一行,用不相交的set_id标记每个顶点。
约束
相关
答案 0 :(得分:0)
我实际上正在解决同样的问题。不幸的是,我不认为可以找到一个非常有效的解决方案 - 至少不能轻易使用SQL。只需删除' distinct'以及自我消除查询,以观察工作集的大小。也就是说,以下解决方案将起作用。
drop table if exists foobar;
drop function if exists addset(v int[], a int);
/* our vertices table */
create table foobar (
src int,
dst int
);
/* Create a small function to treat an array like a set,
not strictly necessary but convenient */
create function addset(v int[], a int) returns int[]
as $$
begin
return (select array_agg(e order by e)
from (select unnest(v || a) as e) f);
end
$$ language plpgsql;
/* fill our table with vertices, note the ordering of each value */
insert into foobar (src, dst)
values (1,2), (1,3), (2,3), (3,4), (4,5), (6,7), (6,8);
/* use a recursive query to extend the sets */
with recursive foo_union (v) as (
select array[src, dst] as v from foobar /* starter sets */
union all
/* join self to original array; i can use a CTE as a 'starter',
but that is not necessary here */
select addset(v, dst) as v from foo_union u, foobar f
where src = any(v) and not dst = any(v)
) select distinct v from foo_union a where not exists (
/* eliminate the many overlapping results */
select * from foo_union b where b.v @> a.v and b.v != a.v
);
但同样,这对大型数据集来说是完全不切实际的;任何其他解决方案都需要迭代。
答案 1 :(得分:0)
这个纯粹的sql代码在5分钟内分组了大约35000条记录(8个核心/ 32 GB内存)。享受。
--table with RELATIONS, idea is to place every related item in a bucket
create table RELATIONS
(
bucket int, -- initially 0
bucketsub int, -- initially 0
relnr1 float,
relnr2 float -- relnr1 = a, relnr2 = b means a and b are related
)
create table ##BucketRelnrs ( relnr float ); --table functions as temp list
declare @bucket int;
declare @bucketsub int;
declare @nrOfUpdates int;
declare @relnr float;
set @bucket=0;
set @relnr=0;
WHILE @relnr>=0 and @bucket<50000 --to prevent the while loop from hanging.
BEGIN
set @bucket = @bucket+1
set @bucketsub=1;
set @relnr = (select isnull(min(relnr1),-1) from RELATIONS where bucket=0); --pick the smallest relnr that has not been assigned a bucket yet
set @nrOfUpdates = (select count(*) from RELATIONS where bucket=0 and (relnr1=@relnr or relnr2=@relnr));
update RELATIONS set bucket=@bucket, bucketsub=@bucketsub where bucket=0 and (relnr1=@relnr or relnr2=@relnr);
set @bucketsub = @bucketsub+1;
WHILE @nrOfUpdates>0 and @bucketsub<=10 --to prevent the inner while loop from hanging, actually determines the number of iterations
BEGIN
--refill temp list with newly found related relnrs
truncate table ##BucketRelnrs;
insert into ##BucketRelnrs
select distinct relnr1 from RELATIONS where bucket=@bucket
union select distinct relnr2 from RELATIONS where bucket=@bucket;
--calculate the number of relations that will be updated next, if zero then stop iteration
set @nrOfUpdates =
(
select count(*) from RELATIONS where bucket=0
and (relnr1 in (select relnr from ##BucketRelnrs) or relnr2 in (select relnr from ##BucketRelnrs))
);
--update the RELATIONS table
update RELATIONS set bucket=@bucket, bucketsub=@bucketsub where bucket=0
and (relnr1 in (select relnr from ##BucketRelnrs) or relnr2 in (select relnr from ##BucketRelnrs));
set @bucketsub = @bucketsub+1;
END
END
drop table ##BucketRelnrs; --clean temp table