表 [dbo].[SourceData] 有 1900 万行。
我正在对这张表运行 WHILE 循环,并根据匹配条件把数据加载到另一张表中,但这个循环的运行时间长得离谱。
示例代码如下。SourceData 表有一个 seqno 列,它是唯一的 identity 列(主键)。FirstName、LastName、Address、EmailAddress 列上也各自建有单独的非聚集(NC)索引。
-- Result table: one row per (scored row, compared row) pair together with
-- its match score and the 'Match' / 'Review' status derived from that score.
CREATE TABLE #holdscore
(
    seqno         bigint NOT NULL,   -- seqno of the row being scored
    associatedseq bigint NOT NULL,   -- seqno of the row it was compared against
    scrore        int,               -- match score; the misspelled name is kept so existing references still resolve
    status        varchar(20),
    customerid    varchar(30),
    -- Deliberately unnamed: an explicitly named constraint on a temp table is
    -- created under that exact name in tempdb, so a second session running this
    -- script concurrently would fail with "object already exists".
    PRIMARY KEY CLUSTERED (seqno ASC, associatedseq ASC)
);
-- Work list of seqno values to score. Flag is NULL while a seqno is pending.
CREATE TABLE #loop
(
    seqno bigint     NOT NULL,
    Flag  varchar(1) NULL,
    PRIMARY KEY CLUSTERED (seqno)
);
-- Seed the work list with the 1000 lowest seqno values.
-- DISTINCT was dropped: seqno is described as the table's unique identity
-- primary key, so the de-duplication step could never remove a row.
INSERT INTO #loop (seqno)
SELECT TOP (1000)
    seqno
FROM [dbo].[SourceData]
ORDER BY seqno;
-- Scratch variables: the seqno currently being processed and the attribute
-- values of that row which every other row is compared against.
DECLARE
    @seqno      bigint,
    @firstname  nvarchar(100),
    @lastname   nvarchar(100),
    @phonenum   nvarchar(100),
    @emailadd   nvarchar(100),
    @Address    nvarchar(250),
    @MiddleName nvarchar(50),
    @CCExpYYMM  nvarchar(4),
    @CCLastFour nvarchar(4);
-- For every pending seqno in #loop: load that row's attributes into the
-- variables, score it against [dbo].[SourceData], store every scored pair in
-- #holdscore, then mark the seqno as done (Flag = 'Y').
--
-- Changes versus the original loop:
--   * The 78 and 73 tiers were unreachable because the less specific 68 and 63
--     tiers were listed before them; CASE returns the first matching WHEN, so
--     the more specific tiers now come first.
--   * Scores are int literals (the target column is int) instead of strings
--     that relied on implicit conversion.
--   * The scan is pre-filtered with a condition implied by every scoring tier,
--     so the engine may use the name/email indexes instead of reading the
--     whole table on each iteration. Rows removed by the filter would have
--     scored NULL and been discarded anyway.
--   * EXISTS replaces COUNT(*) > 0, TOP (1) gets an ORDER BY, and the INSERT
--     names its target columns.
--   * Temp-table and column names use the casing of their declarations, so the
--     batch also works under a case-sensitive collation.
-- NOTE(review): the comparisons use "=", so an attribute that is NULL on the
-- current row can never match; confirm this is the intended treatment of NULLs.
WHILE EXISTS (SELECT 1 FROM #loop WHERE Flag IS NULL)
BEGIN
    -- Lowest pending seqno first keeps the processing order deterministic.
    SELECT TOP (1)
        @seqno = seqno
    FROM #loop
    WHERE Flag IS NULL
    ORDER BY seqno;

    SELECT
        @firstname  = [FirstName],
        @lastname   = [LastName],
        @phonenum   = [PhoneNorm],
        @emailadd   = [EmailAddress],
        @Address    = [AddressNorm],
        @MiddleName = [MiddleName],
        @CCExpYYMM  = [CCExpYYMM],
        @CCLastFour = [CCLastFour]
    FROM [dbo].[SourceData]
    WHERE seqno = @seqno;

    -- "scrore" is the (misspelled) score column name declared on #holdscore.
    INSERT INTO #holdscore (seqno, associatedseq, scrore, status, customerid)
    SELECT
        A.orginalseqno,
        A.associatedseq,
        A.Score,
        -- Score is never NULL here (filtered below), so ELSE covers "< 80".
        CASE WHEN A.Score >= 80 THEN 'Match' ELSE 'Review' END AS Status,
        A.customerid
    FROM (
        SELECT
            @seqno AS orginalseqno,
            seqno  AS associatedseq,
            customerid,
            CASE
                -- First + last name tiers, most specific first.
                WHEN [FirstName] = @firstname AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address AND [MiddleName] = @MiddleName
                 AND [CCExpYYMM] = @CCExpYYMM AND [CCLastFour] = @CCLastFour THEN 100
                WHEN [FirstName] = @firstname AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address AND [MiddleName] = @MiddleName
                 AND [CCExpYYMM] = @CCExpYYMM THEN 99
                WHEN [FirstName] = @firstname AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address AND [MiddleName] = @MiddleName
                 AND [CCLastFour] = @CCLastFour THEN 99
                WHEN [FirstName] = @firstname AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address AND [MiddleName] = @MiddleName THEN 98
                WHEN [FirstName] = @firstname AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address THEN 93
                WHEN [FirstName] = @firstname AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd THEN 83
                -- 78 and 73 must precede 68 and 63, which are subsets of them.
                WHEN [FirstName] = @firstname AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum AND [AddressNorm] = @Address THEN 78
                WHEN [FirstName] = @firstname AND [LastName] = @lastname
                 AND [EmailAddress] = @emailadd AND [AddressNorm] = @Address THEN 73
                WHEN [FirstName] = @firstname AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum THEN 68
                WHEN [FirstName] = @firstname AND [LastName] = @lastname
                 AND [EmailAddress] = @emailadd THEN 63
                WHEN [FirstName] = @firstname AND [LastName] = @lastname
                 AND [AddressNorm] = @Address THEN 58
                -- Tiers where only one of the two names matches.
                WHEN [FirstName] = @firstname
                 AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address AND [MiddleName] = @MiddleName THEN 73
                WHEN [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address AND [MiddleName] = @MiddleName THEN 75
                WHEN [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address THEN 70
                WHEN [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum AND [EmailAddress] = @emailadd THEN 60
            END AS Score
        FROM [dbo].[SourceData]
        -- Every tier above requires either a last-name match or a
        -- first-name + email match, so this only removes rows scoring NULL.
        WHERE [LastName] = @lastname
           OR ([FirstName] = @firstname AND [EmailAddress] = @emailadd)
    ) AS A
    WHERE A.Score IS NOT NULL
    OPTION (MAXDOP 8);

    UPDATE #loop
    SET Flag = 'Y'
    WHERE seqno = @seqno
      AND Flag IS NULL;
END
仅处理 1000 个不同的 seqno 就需要 1 个多小时。我需要把全部 1900 万行两两比较,并把结果加载到表中。请帮我加快这个过程,以便能及时完成数据加载。使用 SSIS 的方案也可以。
答案 0 :(得分:0)
可以在下面这个查询的基础上构建:
-- Set-based starting point for the score-100 tier: pairs every row (s1) with
-- every row (s2) whose eight compared attributes are all equal.
-- Fixes versus the posted snippet: "s2,seqno" was a typo for "s2.seqno", and
-- three join predicates referenced the question's variable names (phonenum,
-- emailadd, Address) instead of the columns PhoneNorm, EmailAddress and
-- AddressNorm. The constant columns are now named as well.
SELECT
    s1.seqno      AS orginalseqno,
    s2.seqno      AS associatedseq,
    100           AS score,
    'Match'       AS status,
    s2.customerid
FROM [dbo].[SourceData] AS s1
INNER JOIN [dbo].[SourceData] AS s2
    ON  s2.[FirstName]    = s1.[FirstName]
    AND s2.[LastName]     = s1.[LastName]
    AND s2.[PhoneNorm]    = s1.[PhoneNorm]
    AND s2.[EmailAddress] = s1.[EmailAddress]
    AND s2.[AddressNorm]  = s1.[AddressNorm]
    AND s2.[MiddleName]   = s1.[MiddleName]
    AND s2.[CCExpYYMM]    = s1.[CCExpYYMM]
    AND s2.[CCLastFour]   = s1.[CCLastFour];
从这里开始,按分数由高到低逐级处理,并在每一级对已插入的结果表做 LEFT JOIN,这样就可以避免插入已经存在更高分数的配对。一般来说,不要试图用复杂的查询条件来排除更高分数的情况,除非条件非常简单,例如 99 分这一级只需加上 s2.[CCLastFour] <> s1.[CCLastFour]。
答案 1 :(得分:0)
我的答案与 Frisbee 的非常相似(只是在各个测试组之间使用 UNION ALL),所以就不再贴 SQL 了。我要补充的是:虽然这可能正是你想要的解决方案,但即使是这种基于集合的方法,在 1900 万行的表上运行时也会是一个非常繁重的查询。据我理解,你是想找出表中各个人之间的关联度或相似度。如果我理解得没错,你想把每个人与其他每个人逐一比较。例如,名称、地址和出生日期(DOB,或其他字段)全部匹配时得 100 分;然后让下一个测试稍微宽松一些,并赋予较低的分数,依此类推。随着测试条件越来越宽松,自联接就越来越像交叉联接——命中的行也会越来越多。如果被测试的列基数较低(存在许多重复值),最终可能会产生数百万(甚至数十亿、数万亿)行。请注意只测试那些能返回有实际价值结果的关联。举一个(极端的)例子:如果仅根据性别来测试相似性,最终会得到两个各 950 万行的交叉联接。