在SQL Server 2008/2012中优化针对大型表的while循环

时间:2015-10-01 21:24:04

标签: sql-server sql-server-2008 tsql ssis query-performance

表[dbo].[SourceData]有1900万行。

我正在针对此表运行while循环,并根据匹配条件将数据加载到另一个表中。但是这个while循环要运行非常久才能完成。

示例代码如下。SourceData表有seqno列,它是唯一的identity列(主键)。FirstName、LastName、Address、EmailAddress列上也各有单独的非聚集(NC)索引。

-- Result table: one row per (scored row, candidate row) pair with its match score.
-- NOTE: the column name "scrore" (sic) is kept unchanged so existing references keep working.
create table #holdscore
(
     seqno bigint not null,          -- seqno of the row being scored
     associatedseq bigint not null,  -- seqno of the row it was compared against
     scrore int,                     -- similarity score assigned by the loader
     status varchar(20),             -- 'Match' or 'Review', derived from the score
     customerid varchar(30),

     -- Table-level composite key (note the comma after the last column above).
     -- Deliberately unnamed: constraint names are database-scoped objects in tempdb,
     -- so a named constraint on a #temp table fails when two sessions run this
     -- script at the same time.
     PRIMARY KEY CLUSTERED (seqno ASC, associatedseq ASC)
)

-- Work queue: one row per seqno to process.
-- Flag is NULL while the row is pending and is set to 'Y' once it has been handled.
CREATE TABLE #loop
(
    seqno BIGINT     NOT NULL PRIMARY KEY CLUSTERED,
    Flag  VARCHAR(1) NULL
)

-- Seed the work queue with the 1000 lowest seqno values.
-- seqno is described as the unique identity primary key of SourceData,
-- so DISTINCT would only add a needless de-duplication step.
INSERT INTO #loop (seqno)
SELECT TOP (1000) seqno
FROM [dbo].[SourceData]
ORDER BY seqno

-- Scalar copies of the attributes of the row currently being scored;
-- they are refilled for every seqno taken from the work queue.
DECLARE
    @seqno      BIGINT,
    @firstname  NVARCHAR(100),
    @lastname   NVARCHAR(100),
    @phonenum   NVARCHAR(100),
    @emailadd   NVARCHAR(100),
    @Address    NVARCHAR(250),
    @MiddleName NVARCHAR(50),
    @CCExpYYMM  NVARCHAR(4),
    @CCLastFour NVARCHAR(4)

-- For every pending seqno in #loop: load that row's attributes, score every
-- SourceData row against it, store the scored pairs in #holdscore, then mark
-- the seqno as done.
WHILE EXISTS (SELECT 1 FROM #loop WHERE Flag IS NULL)   -- EXISTS stops at the first pending row
BEGIN
    -- Next pending seqno; ORDER BY makes the pick deterministic.
    SELECT TOP (1) @seqno = seqno
    FROM #loop
    WHERE Flag IS NULL
    ORDER BY seqno

    -- Attributes of the row being scored.
    SELECT @firstname  = [FirstName],
           @lastname   = [LastName],
           @phonenum   = [PhoneNorm],
           @emailadd   = [EmailAddress],
           @Address    = [AddressNorm],
           @MiddleName = [MiddleName],
           @CCExpYYMM  = [CCExpYYMM],
           @CCLastFour = [CCLastFour]
    FROM [dbo].[SourceData]
    WHERE seqno = @seqno

    -- Explicit column list so the insert does not depend on column order.
    INSERT INTO #holdscore (seqno, associatedseq, scrore, status, customerid)
    SELECT
        A.orginalseqno,
        A.associatedseq,
        A.Score,
        CASE WHEN A.Score >= 80 THEN 'Match'
             WHEN A.Score <  80 THEN 'Review'
        END AS Status,
        A.customerid
    FROM
        (SELECT
             @seqno AS orginalseqno,
             seqno  AS associatedseq,
             customerid,
             -- CASE returns the first branch that matches, so a stricter rule must
             -- come before any looser rule whose columns are a subset of it.
             -- Scores are integer literals to match the int target column.
             CASE
                 WHEN [FirstName] = @firstname AND [LastName] = @lastname AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd
                  AND [AddressNorm] = @Address AND [MiddleName] = @MiddleName
                  AND [CCExpYYMM] = @CCExpYYMM AND [CCLastFour] = @CCLastFour THEN 100
                 WHEN [FirstName] = @firstname AND [LastName] = @lastname AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd
                  AND [AddressNorm] = @Address AND [MiddleName] = @MiddleName
                  AND [CCExpYYMM] = @CCExpYYMM THEN 99
                 WHEN [FirstName] = @firstname AND [LastName] = @lastname AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd
                  AND [AddressNorm] = @Address AND [MiddleName] = @MiddleName
                  AND [CCLastFour] = @CCLastFour THEN 99
                 WHEN [FirstName] = @firstname AND [LastName] = @lastname AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd
                  AND [AddressNorm] = @Address AND [MiddleName] = @MiddleName THEN 98
                 WHEN [FirstName] = @firstname AND [LastName] = @lastname AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd
                  AND [AddressNorm] = @Address THEN 93
                 WHEN [FirstName] = @firstname AND [LastName] = @lastname AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd THEN 83
                 -- The next two rules used to sit below the 68 / 63 rules, which match a
                 -- subset of their columns, so they could never be reached.
                 WHEN [FirstName] = @firstname AND [LastName] = @lastname AND [PhoneNorm] = @phonenum AND [AddressNorm] = @Address THEN 78
                 WHEN [FirstName] = @firstname AND [LastName] = @lastname AND [EmailAddress] = @emailadd AND [AddressNorm] = @Address THEN 73
                 WHEN [FirstName] = @firstname AND [LastName] = @lastname AND [PhoneNorm] = @phonenum THEN 68
                 WHEN [FirstName] = @firstname AND [LastName] = @lastname AND [EmailAddress] = @emailadd THEN 63
                 WHEN [FirstName] = @firstname AND [LastName] = @lastname AND [AddressNorm] = @Address THEN 58
                 WHEN [FirstName] = @firstname AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd
                  AND [AddressNorm] = @Address AND [MiddleName] = @MiddleName THEN 73
                 WHEN [LastName] = @lastname AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd
                  AND [AddressNorm] = @Address AND [MiddleName] = @MiddleName THEN 75
                 WHEN [LastName] = @lastname AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd
                  AND [AddressNorm] = @Address THEN 70
                 WHEN [LastName] = @lastname AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd THEN 60
             END AS Score
         FROM [dbo].[SourceData]
         -- Every rule above requires FirstName or LastName to match, so this filter
         -- drops only rows that would score NULL anyway, while giving the optimizer
         -- a predicate it can evaluate before the CASE (and possibly seek on).
         WHERE [FirstName] = @firstname
            OR [LastName]  = @lastname
        ) AS A
    WHERE A.Score IS NOT NULL
    OPTION (MAXDOP 8)

    -- Mark this seqno as processed so the loop moves on.
    UPDATE #loop
    SET Flag = 'Y'
    WHERE seqno = @seqno
      AND Flag IS NULL
END

仅处理1000个不同的seqno就需要1个多小时。我需要将1900万行相互比较并把结果加载到表中。请帮助我加快这个过程,以便能够及时加载数据。使用SSIS的方案也可以。

2 个答案:

答案 0 :(得分:0)

可以以下面的查询为基础进行构建:

-- Set-based starting point: pairs of rows that agree on all eight attributes
-- score 100 ('Match').
-- Note: as written, a row whose eight compared columns are all non-NULL also
-- pairs with itself (s1.seqno = s2.seqno).
select s1.seqno      as orginalseqno,
       s2.seqno      as associatedseq,
       100           as score,
       'Match'       as status,
       s2.customerid
 from [SourceData] as s1
 inner join [SourceData] as s2
    on s2.[FirstName]    = s1.[FirstName]
   and s2.[LastName]     = s1.[LastName]
   and s2.[PhoneNorm]    = s1.[PhoneNorm]
   and s2.[EmailAddress] = s1.[EmailAddress]
   and s2.[AddressNorm]  = s1.[AddressNorm]
   and s2.[MiddleName]   = s1.[MiddleName]
   and s2.[CCExpYYMM]    = s1.[CCExpYYMM]
   and s2.[CCLastFour]   = s1.[CCLastFour]

从这里开始,按分数从高到低依次处理,并对插入的目标表做LEFT JOIN,这样就可以避免插入已经有更高分数的数据。一般来说,不要试图构建复杂的查询来排除更高分数的情况,除非条件非常简单,例如99分对应 s2.[CCLastFour] <> s1.[CCLastFour]。

答案 1 :(得分:0)

我的答案与Frisbee的非常相似(在每组测试之间使用UNION ALL),所以我就不贴SQL了。我要补充的是:虽然这很可能就是你想要的解决方案,但即使是这种基于集合的方法,在1900万行的表上运行时也会是一个非常繁重的查询。据我理解,你是想找出表中各个人之间的关联度或相似度;如果我理解得没错,你是想把每个人与其他所有人逐一比较。如果姓名、地址和DOB(或其他字段)都匹配则得100分,接着把下一个测试放宽一点并赋予较低的分数,依此类推。随着测试条件越来越宽松,自联接就越来越接近交叉联接——命中的行会越来越多。如果被测试的列基数较低(存在大量重复值),最终可能会产生数百万(甚至数十亿、数万亿)行。请注意只测试那些能返回有实用价值结果的关联。举一个(极端的)例子:如果只根据性别来测试相似性,最终就会得到两个各950万行的交叉联接。