有一张非常大的桌子(超过2亿行)
sID int,wordID int(PK sID,wordID)
想要找到具有完全相同的wordID(并且没有额外内容)的sID 对于具有超过100个单词ID的sID,精确匹配的可能性下降,因此愿意将其限制为100 (但想去1000)
如果这是学校,sID是班级,而wordID是学生 然后我想找到有完全相同学生的班级。
sID,wordID
1,1
1,2
1,3
2,2
2,3
3,1
3,4
5,1
5,2
6,2
6,3
7,1
7,2
8,1
8,1
sID 6和2具有完全相同的wordID's sID 5,7和8具有完全相同的wordID
这是我到目前为止所做的事 我想删除两个删除#temp3_sID1_sID2并在上面的插入中处理这个 但我会尝试任何想法 您可以轻松创建一个包含2亿行的表来进行测试
drop table #temp_sID_wordCount
drop table #temp_count_wordID_sID
drop table #temp3_wordID_sID_forThatCount
drop table #temp3_sID1_sID2
drop table #temp3_sID1_sID2_keep
create table #temp_sID_wordCount (sID int primary key, ccount int not null)
create table #temp_count_wordID_sID (ccount int not null, wordID int not null, sID int not null, primary key (ccount, wordID, sID))
create table #temp3_wordID_sID_forThatCount (wordID int not null, sID int not null, primary key(wordID, sID))
create table #temp3_sID1_sID2_keep (sID1 int not null, sID2 int not null, primary key(sID1, sID2))
create table #temp3_sID1_sID2 (sID1 int not null, sID2 int not null, primary key(sID1, sID2))
insert into #temp_sID_wordCount
select sID, count(*) as ccount
FROM [FTSindexWordOnce] with (nolock)
group by sID
order by sID;
select count(*) from #temp_sID_wordCount where ccount <= 100; -- 701,966
truncate table #temp_count_wordID_sID
insert into #temp_count_wordID_sID
select #temp_sID_wordCount.ccount, [FTSindexWordOnce].wordID, [FTSindexWordOnce].sID
from #temp_sID_wordCount
join [FTSindexWordOnce] with (nolock)
on [FTSindexWordOnce].sID = #temp_sID_wordCount.sID
and ccount >= 1 and ccount <= 10
order by #temp_sID_wordCount.ccount, [FTSindexWordOnce].wordID, [FTSindexWordOnce].sID;
select count(*) from #temp_sID_wordCount; -- 34,860,090
truncate table #temp3_sID1_sID2_keep
declare cur cursor for
select top 10 ccount from #temp_count_wordID_sID group by ccount order by ccount
open cur
declare @count int, @sIDcur int
fetch next from cur into @count
while (@@FETCH_STATUS = 0)
begin
--print (@count)
--select count(*), @count from #temp_sID_wordCount where #temp_sID_wordCount.ccount = @count
truncate table #temp3_wordID_sID_forThatCount
truncate table #temp3_sID1_sID2
-- wordID and sID for that unique word count
-- they can only be exact if they have the same word count
insert into #temp3_wordID_sID_forThatCount
select #temp_count_wordID_sID.wordID
, #temp_count_wordID_sID.sID
from #temp_count_wordID_sID
where #temp_count_wordID_sID.ccount = @count
order by #temp_count_wordID_sID.wordID, #temp_count_wordID_sID.sID
-- select count(*) from #temp3_wordID_sID_forThatCount
-- this has some duplicates
-- sID1 is the group
insert into #temp3_sID1_sID2
select w1.sID, w2.sID
from #temp3_wordID_sID_forThatCount as w1 with (nolock)
join #temp3_wordID_sID_forThatCount as w2 with (nolock)
on w1.wordID = w2.wordID
and w1.sID <= w2.sID
group by w1.sID, w2.sID
having count(*) = @count
order by w1.sID, w2.sID
-- get rid of the goups of 1
delete #temp3_sID1_sID2
where sID1 in (select sID1 from #temp3_sID1_sID2 group by sID1 having count(*) = 1)
-- get rid of the double dips
delete #temp3_sID1_sID2
where #temp3_sID1_sID2.sID1 in
(select distinct s1del.sID1 -- these are the double dips
from #temp3_sID1_sID2 as s1base with (nolock)
join #temp3_sID1_sID2 as s1del with (nolock)
on s1del.sID1 > s1base.sID1
and s1Del.sID1 = s1base.sID2)
insert into #temp3_sID1_sID2_keep
select #temp3_sID1_sID2.sID1
, #temp3_sID1_sID2.sID2
from #temp3_sID1_sID2 with (nolock)
order by #temp3_sID1_sID2.sID1, #temp3_sID1_sID2.sID2
fetch next from cur into @count
end
close cur
deallocate cur
select *
FROM #temp3_sID1_sID2_keep with (nolock)
order by 1,2
答案 0 :(得分:1)
因此,正如我所见,任务是找到相同的子集。
首先,我们可以找到一对相等的子集:
;with tmp1 as (select sID, cnt = count(wordID) from [Table] group by sID)
select s1.sID, s2.sID
from tmp1 s1
cross join tmp1 s2
cross apply (
select count(1)
from [Table] d1
join [Table] d2 on d2.wordID = d1.wordID
where d1.sID = s1.sID and d2.sID = s2.sID
) c(cnt)
where s1.cnt = s2.cnt
and s1.sID > s2.sID
and s1.cnt = c.cnt
输出是:
sID sID
----------- -----------
6 2
7 5
8 5
8 7
然后,如果需要,可以将对组合成组:
sID gNum
----------- -----------
2 1
6 1
5 2
7 2
8 2
请参阅下面的SqlFiddle示例中的详细信息。
另一种方法是为每个子集数据计算哈希函数:
;with a as (
select distinct sID from [Table]
)
select sID,
hashbytes('sha1', (
select cast(wordID as varchar(10)) + '|'
from [Table]
where sID = a.sID
order by wordID
for xml path('')))
from a
然后可以根据哈希值对子集进行分组。
最后一个在我的机器上花了不到一分钟的时间来测试大约1000万行的测试数据(20k sID值,每个最多1k wordID)。您也可以通过将没有wordID计数匹配的sID排除在任何其他位置来优化它。