找到确切的FK匹配

时间:2013-10-24 16:11:36

标签: tsql sql-server-2008-r2

有一张非常大的表(超过2亿行)
sID int,wordID int(PK sID,wordID)

想要找到具有完全相同的 wordID 集合(并且没有多余 wordID)的 sID。对于拥有超过 100 个 wordID 的 sID,出现精确匹配的可能性会下降,因此可以接受将范围限制在 100 个以内(但希望能做到 1000 个)。

打个比方:如果这是一所学校,sID 是班级,wordID 是学生,那么我想找到学生名单完全相同的班级。

sID,wordID
1,1 1,2 1,3 2,2 2,3 3,1 3,4 5,1 5,2 6,2 6,3 7,1 7,2 8,1 8,2

sID 6 和 2 具有完全相同的 wordID;sID 5、7 和 8 具有完全相同的 wordID。

这是我目前的做法。我希望去掉针对 #temp3_sID1_sID2 的两条 delete 语句,并在它们上面的 insert 语句中直接完成这部分处理,不过任何思路我都愿意尝试。您可以很容易地创建一个包含 2 亿行的表来进行测试。

  -- (Re)create the working tables used by the script.
  -- Each drop is guarded with object_id() so the script also runs in a fresh
  -- session where the temp tables do not exist yet (an unguarded DROP TABLE
  -- raises an error in that case). DROP TABLE IF EXISTS is not used because
  -- the question targets SQL Server 2008 R2.
  if object_id('tempdb..#temp_sID_wordCount') is not null
    drop table #temp_sID_wordCount
  if object_id('tempdb..#temp_count_wordID_sID') is not null
    drop table #temp_count_wordID_sID
  if object_id('tempdb..#temp3_wordID_sID_forThatCount') is not null
    drop table #temp3_wordID_sID_forThatCount
  if object_id('tempdb..#temp3_sID1_sID2') is not null
    drop table #temp3_sID1_sID2
  if object_id('tempdb..#temp3_sID1_sID2_keep') is not null
    drop table #temp3_sID1_sID2_keep

  -- number of wordIDs per sID
  create table #temp_sID_wordCount  (sID int primary key, ccount int not null)
  -- (sID, wordID) rows tagged with the word count of their sID
  create table #temp_count_wordID_sID  (ccount int not null, wordID int not null, sID int not null, primary key (ccount, wordID, sID))
  -- the (wordID, sID) rows for the single word count currently being processed
  create table #temp3_wordID_sID_forThatCount  (wordID int not null, sID int not null, primary key(wordID, sID))
  -- accumulated result: (sID1, sID2) pairs kept across all word counts
  create table #temp3_sID1_sID2_keep  (sID1 int not null, sID2 int not null, primary key(sID1, sID2))
  -- per-word-count staging table for candidate (sID1, sID2) pairs
  create table #temp3_sID1_sID2  (sID1 int not null, sID2 int not null, primary key(sID1, sID2))
  -- Step 1: count the wordIDs of every sID.
  -- No ORDER BY on the insert: the clustered primary key of the target table
  -- determines how the rows are stored, not the order of the SELECT.
  insert into #temp_sID_wordCount (sID, ccount)
  select f.sID, count(*) as ccount
    from [FTSindexWordOnce] as f with (nolock)
   group by f.sID;

  -- diagnostic: how many sIDs have at most 100 wordIDs
  select count(*) from #temp_sID_wordCount where ccount <= 100;  -- 701,966 (value recorded by the author)

  -- Step 2: copy the (wordID, sID) rows of the sIDs whose word count is in
  -- the range under test, tagging each row with that word count.
  truncate table #temp_count_wordID_sID
  insert into #temp_count_wordID_sID (ccount, wordID, sID)
  select wc.ccount, f.wordID, f.sID
    from #temp_sID_wordCount as wc
    inner join [FTSindexWordOnce] as f with (nolock)
      on f.sID = wc.sID
   where wc.ccount >= 1 and wc.ccount <= 10;  -- word-count range under test

  -- diagnostic: rows just loaded. The original counted #temp_sID_wordCount
  -- here, which does not describe the table populated above.
  -- NOTE(review): the recorded figure presumably comes from a run with a
  -- wider ccount range than 1..10 -- confirm.
  select count(*) from #temp_count_wordID_sID;  -- 34,860,090 (value recorded by the author)

    -- Find groups of sIDs that have exactly the same set of wordIDs.
    -- Two sIDs can only match exactly when they have the same number of
    -- wordIDs, so the work is done one word count at a time: each pass
    -- self-joins only the rows that belong to a single ccount value.
    --
    -- Result rows in #temp3_sID1_sID2_keep are (sID1, sID2) where sID1 is the
    -- smallest sID of a group of two or more identical sIDs and sID2 is every
    -- member of that group (including sID1 itself).
    truncate table #temp3_sID1_sID2_keep

    -- local: the cursor goes away with the batch, so a run that fails part-way
    --        does not leave a cursor named "cur" behind to break the next run.
    -- fast_forward: the cursor is only read forward, once.
    declare cur cursor local fast_forward for
    select top 10 ccount from #temp_count_wordID_sID group by ccount order by ccount

    declare @count int

    open cur
    fetch next from cur into @count
    while (@@FETCH_STATUS = 0)
    begin
      truncate table #temp3_wordID_sID_forThatCount

      -- wordID and sID for that unique word count
      insert into #temp3_wordID_sID_forThatCount (wordID, sID)
      select c.wordID, c.sID
        from #temp_count_wordID_sID as c
       where c.ccount = @count

      -- matched: pairs (sID1 <= sID2) that share @count wordIDs. Every sID in
      --          this pass has exactly @count wordIDs, so sharing @count of
      --          them means the two sets are identical. Each sID also pairs
      --          with itself.
      -- led:     for every sID, the smallest sID it is identical to, i.e. the
      --          leader of its group. Taking the minimum replaces the original
      --          "double dips" delete.
      -- sized:   number of members per leader. Keeping only groups with more
      --          than one member replaces the original "groups of 1" delete.
      -- This makes the #temp3_sID1_sID2 staging table unnecessary for this loop.
      ;with matched as (
        select w1.sID as sID1, w2.sID as sID2
          from #temp3_wordID_sID_forThatCount as w1
          inner join #temp3_wordID_sID_forThatCount as w2
            on w1.wordID = w2.wordID
           and w1.sID <= w2.sID
         group by w1.sID, w2.sID
        having count(*) = @count
      ),
      led as (
        select min(m.sID1) as sID1, m.sID2
          from matched as m
         group by m.sID2
      ),
      sized as (
        select l.sID1, l.sID2, count(*) over (partition by l.sID1) as members
          from led as l
      )
      insert into #temp3_sID1_sID2_keep (sID1, sID2)
      select s.sID1, s.sID2
        from sized as s
       where s.members > 1

      fetch next from cur into @count
    end
    close cur
    deallocate cur

 -- Final result: one row per (group leader sID1, group member sID2).
 -- Columns and sort keys are named explicitly instead of "select *" and
 -- "order by 1,2"; no nolock hint is needed on a session-private temp table.
 select k.sID1, k.sID2
   from #temp3_sID1_sID2_keep as k
  order by k.sID1, k.sID2

1 个答案:

答案 0 :(得分:1)

因此,正如我所见,任务是找到相同的子集。

首先,我们可以找出所有彼此相等的子集对:

-- Pairs of sIDs whose wordID sets are identical.
-- Only sIDs with the same number of wordIDs are compared, and each unordered
-- pair is reported once (larger sID first). A pair matches when the number of
-- wordIDs the two sIDs have in common equals that shared word count.
;with sid_sizes as (
    select sID, count(wordID) as cnt
    from [Table]
    group by sID
)
select hi.sID, lo.sID
from sid_sizes as hi
    inner join sid_sizes as lo
        on lo.cnt = hi.cnt
       and lo.sID < hi.sID
    cross apply (
        -- number of wordIDs the two sIDs have in common
        select count(1)
        from [Table] as a
            inner join [Table] as b on b.wordID = a.wordID
        where a.sID = hi.sID and b.sID = lo.sID
    ) as shared(cnt)
where shared.cnt = hi.cnt

输出是:

sID        sID
----------- -----------
6           2
7           5
8           5
8           7

然后,如果需要,可以将对组合成组:

sID         gNum
----------- -----------
2           1
6           1
5           2
7           2
8           2

请参阅下面的SqlFiddle示例中的详细信息。

SqlFiddle Sample


另一种方法是为每个子集数据计算哈希函数:

-- One SHA-1 fingerprint per sID, computed over its wordIDs concatenated in
-- ascending order ("w1|w2|...|"). sIDs with the same wordID set get the same
-- fingerprint, so grouping on wordSetHash yields the groups of identical sIDs.
-- NOTE(review): HASHBYTES is documented to accept at most 8000 bytes of input
-- on SQL Server versions before 2016; very long wordID lists may exceed
-- that -- confirm against the target version.
;with a as (
    select distinct sID from [Table]
)
select sID,
    hashbytes('sha1', (
        -- varchar(11), not varchar(10): an int can need 11 characters
        -- (e.g. -2147483648). A too-short target turns the value into '*',
        -- which would make different wordIDs hash alike.
        select cast(wordID as varchar(11)) + '|'
        from [Table]
        where sID = a.sID
        order by wordID
        for xml path(''))) as wordSetHash
from a

然后可以根据哈希值对子集进行分组。

SqlFiddle Sample

最后这种方法在我的机器上处理约 1000 万行测试数据(2 万个 sID 值,每个 sID 最多 1000 个 wordID)用时不到一分钟。您还可以进一步优化:先排除那些 wordID 数量与其他任何 sID 都不相同的 sID。