我的客户表大约有130万行。原先用来匹配客户的程序没有按预期工作,导致大量客户无法与已有客户匹配。
该程序最大的问题之一是它试图使用账单电子邮件(BillEmail)作为匹配依据。问题在于,我们的客户来自许多不同的市场,其中包括会对客户电子邮件进行加密的市场,例如Amazon.com。
因此,电子邮件匹配只是特定网站和市场的一种选择。
我想知道是否有比我下面的尝试更好的方法,可以在客户表上进行“四项匹配”(4-way match),这样我就能看出两条记录在不同的列上是否有多项匹配。
例如:
ListAEmail | ListAFullname | ListAAddress | ListAPhone
------------------------------------------------------------------------------
2b************@marketplace.amazon.com | jeff neal | 49 willow | 4*******7
------------------------------------------------------------------------------
这将是“4项中匹配2项”的情况:
ListBEmail | ListBFullname | ListBAddress | ListBPhone
------------------------------------------------------------------------------
1********@gmail.com | jeff neal | 7-49 willow | 4*******1
------------------------------------------------------------------------------
它会在[ListBFullname]上匹配100%,在[ListBAddress]上匹配83.33%,所以我认为这是匹配客户,我想为他的订单分配相同的客户ID。
我认为下面的存储过程(我已经从this article修改过)可以以某种方式进行优化,但我没有看到它。它已运行超过15个小时。任何帮助将不胜感激。
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
-- =============================================================================
-- dbo.FuzzyMatchBigData
--
-- Purpose:
--   Rebuilds dbo.TempMatch with every pair of [Customer] rows whose e-mail,
--   full name, leading street text ("Numbers") or phone reaches @MatchScore
--   according to MDS1.mdq.Similarity, and stores the four similarity scores.
--
-- Parameters:
--   @MatchScore float (default .8)
--       Threshold a score must reach for a pair to be kept. It is also passed
--       to mdq.Similarity as its last argument (the minimum-score hint).
--
-- Side effects:
--   TRUNCATEs dbo.TempMatch, then repopulates it in a single INSERT.
--
-- Changes compared with the previous revision:
--   * The normalisation logic was written out twice (once per side of the
--     self-join); it now lives in one CTE chain that both sides reference.
--   * Columns that were computed but never read (city, company, country,
--     postal code, state, street 2/3, BillName, Address) were removed.
--   * The scores were inserted as 0 and then filled in by four unfiltered
--     UPDATE passes over dbo.TempMatch; they are now written by the INSERT.
--   * The INSERT names its target columns (the same names the former UPDATE
--     statements referenced) instead of relying on column position.
--   * The blank-e-mail test compared LOWER(BillEmail) with 'N/A', which can
--     never be true under a case-sensitive collation; it now uses 'n/a'.
--
-- NOTE(review): the join is still an all-pairs self-join, so every customer
--   is compared with every customer (including itself, and in both
--   directions). The pair set is deliberately left unchanged here because
--   dbo.TempMatch carries no CustomerID columns; confirm with its consumers
--   whether self pairs / mirrored pairs can be dropped.
-- NOTE(review): NULL Bill* values are not treated as blank (NULL = '' is not
--   true), so they do not fall back to the Ship* value - confirm whether the
--   source columns are nullable.
-- =============================================================================
CREATE PROCEDURE [dbo].[FuzzyMatchBigData] (
    @MatchScore float = .8
) AS
TRUNCATE TABLE [dbo].[TempMatch];

WITH Fallback AS (
    -- Step 1: prefer the billing value, fall back to the shipping value when
    -- the billing value is blank, and turn CR/LF into spaces.
    SELECT CASE WHEN c.BillEmail = '' OR c.BillEmail = 'N/A' THEN c.ShipEmail
                ELSE c.BillEmail
           END AS BillEmail
          ,REPLACE(REPLACE(
               CASE WHEN c.BillFirstName = '' THEN c.ShipFirstName ELSE c.BillFirstName END
              ,CHAR(13), ' '), CHAR(10), ' ') AS BillFirstName
          ,REPLACE(REPLACE(
               CASE WHEN c.BillLastName = '' THEN c.ShipLastName ELSE c.BillLastName END
              ,CHAR(13), ' '), CHAR(10), ' ') AS BillLastName
          ,REPLACE(REPLACE(
               CASE WHEN c.BillPhone = '' THEN c.ShipPhone ELSE c.BillPhone END
              ,CHAR(13), ' '), CHAR(10), ' ') AS BillPhone
          ,CASE WHEN st.BillPrefix = '' THEN st.ShipPrefix
                ELSE st.BillPrefix
           END AS BillNumbers
    FROM [Customer] AS c
    -- Street text up to and including the second space. When there is no
    -- second space CHARINDEX returns 0 and the prefix is the empty string.
    CROSS APPLY (VALUES (
         SUBSTRING(c.BillStreet1, 1, CHARINDEX(' ', c.BillStreet1, CHARINDEX(' ', c.BillStreet1) + 1))
        ,SUBSTRING(c.ShipStreet1, 1, CHARINDEX(' ', c.ShipStreet1, CHARINDEX(' ', c.ShipStreet1) + 1))
    )) AS st (BillPrefix, ShipPrefix)
),
Normalized AS (
    -- Step 2: lower-case everything and replace blank values with a random
    -- number derived from NEWID(), so that blank values are unlikely to
    -- resemble each other.
    SELECT CASE WHEN LOWER(f.BillEmail) IN ('', 'n/a')
                THEN CONVERT(NVARCHAR(30), ABS(CAST(CAST(NEWID() AS VARBINARY) AS INT)))
                ELSE LOWER(f.BillEmail)
           END AS BillEmail
          ,LOWER(f.BillFirstName) + ' ' + LOWER(f.BillLastName) AS FullName
          ,CASE WHEN ph.Stripped = ''
                THEN CONVERT(NVARCHAR(30), ABS(CAST(CAST(NEWID() AS VARBINARY) AS INT)))
                ELSE ph.Stripped
           END AS Phone
          ,CASE WHEN nb.Cleaned = ''
                THEN CONVERT(NVARCHAR(30), ABS(CAST(CAST(NEWID() AS VARBINARY) AS INT)))
                ELSE nb.Cleaned
           END AS Numbers
    FROM Fallback AS f
    -- Phone with the punctuation characters - ) ( space * , + & . = / removed.
    CROSS APPLY (VALUES (
        REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
            LOWER(f.BillPhone), '-', ''), ')', ''), '(', ''), ' ', ''), '*', ''), ',', ''), '+', ''), '&', ''), '.', ''), '=', ''), '/', '')
    )) AS ph (Stripped)
    CROSS APPLY (VALUES (
        LOWER(REPLACE(REPLACE(f.BillNumbers, CHAR(13), ' '), CHAR(10), ' '))
    )) AS nb (Cleaned)
)
INSERT INTO [dbo].[TempMatch] (
     ListAEmail
    ,ListAFullname
    ,ListAAddress
    ,ListAPhone
    ,ListBEmail
    ,ListBFullname
    ,ListBAddress
    ,ListBPhone
    ,Matchscore0
    ,Matchscore1
    ,Matchscore2
    ,Matchscore3
)
SELECT ListA.BillEmail
      ,ListA.FullName
      ,ListA.Numbers
      ,ListA.Phone
      ,ListB.BillEmail
      ,ListB.FullName
      ,ListB.Numbers
      ,ListB.Phone
      ,Score.EmailScore     -- Matchscore0
      ,Score.NameScore      -- Matchscore1
      ,Score.NumberScore    -- Matchscore2
      ,Score.PhoneScore     -- Matchscore3
FROM Normalized AS ListA
CROSS JOIN Normalized AS ListB
-- Same mdq.Similarity arguments as before: method 3, containment bias 1.0,
-- @MatchScore as the minimum-score hint.
CROSS APPLY (
    SELECT MDS1.mdq.Similarity(ListA.BillEmail, ListB.BillEmail, 3, 1.0, @MatchScore) AS EmailScore
          ,MDS1.mdq.Similarity(ListA.FullName,  ListB.FullName,  3, 1.0, @MatchScore) AS NameScore
          ,MDS1.mdq.Similarity(ListA.Numbers,   ListB.Numbers,   3, 1.0, @MatchScore) AS NumberScore
          ,MDS1.mdq.Similarity(ListA.Phone,     ListB.Phone,     3, 1.0, @MatchScore) AS PhoneScore
) AS Score
-- Keep the pair when at least one of the four attributes reaches the threshold.
WHERE Score.NameScore   >= @MatchScore
   OR Score.EmailScore  >= @MatchScore
   OR Score.NumberScore >= @MatchScore
   OR Score.PhoneScore  >= @MatchScore;
修改
根据 @John Pasquet 的建议,我创建了一张新表并采用了他的做法,把查询缩短成了下面这样。
-- Ad-hoc fuzzy match over [CustomerFix].
--
-- Returns pairs of customers for which at least one of name / e-mail /
-- leading street text ("Numbers") / phone reaches @MatchScore according to
-- MDS1.mdq.Similarity, and whose four scores add up to more than 1.
--
-- Changes compared with the previous revision:
--   * Pairs are restricted to ListA.CustomerID < ListB.CustomerID. Without
--     this the self-join returned every customer paired with itself and
--     returned every real pair twice, as (A,B) and (B,A), doubling the number
--     of pairs that had to be scored. Each unordered pair is now reported once.
--   * The derived table that was written out twice is a single CTE.
--   * The unused BillStreet1 output column was dropped and SELECT * was
--     replaced by an explicit column list (same columns, same order).
DECLARE @MatchScore float = .8;

WITH Candidate AS (
    SELECT cf.[CustomerID]
          ,cf.BillEmail
          ,cf.BillFirstName + ' ' + cf.BillLastName AS FullName
          ,cf.BillPhone AS Phone
          ,CASE WHEN st.BillPrefix = '' THEN st.ShipPrefix
                ELSE st.BillPrefix
           END AS Numbers
    FROM [CustomerFix] AS cf
    -- Street text up to and including the second space; empty string when
    -- there is no second space (CHARINDEX returns 0).
    CROSS APPLY (VALUES (
         SUBSTRING(cf.[BillStreet1], 1, CHARINDEX(' ', cf.[BillStreet1], CHARINDEX(' ', cf.[BillStreet1]) + 1))
        ,SUBSTRING(cf.[ShipStreet1], 1, CHARINDEX(' ', cf.[ShipStreet1], CHARINDEX(' ', cf.[ShipStreet1]) + 1))
    )) AS st (BillPrefix, ShipPrefix)
)
SELECT ListA.CustomerID AS ListACustomerID
      ,ListA.BillEmail  AS ListAEmail
      ,ListA.FullName   AS ListAFullname
      ,ListA.Numbers    AS ListAAddress
      ,ListA.Phone      AS ListAPhone
      ,ListB.CustomerID AS ListBCustomerID
      ,ListB.BillEmail  AS ListBEmail
      ,ListB.FullName   AS ListBFullname
      ,ListB.Numbers    AS ListBAddress
      ,ListB.Phone      AS ListBPhone
      ,Score.NameScore
      ,Score.EmailScore
      ,Score.NumberScore
      ,Score.PhoneScore
FROM Candidate AS ListA
INNER JOIN Candidate AS ListB
    ON ListA.CustomerID < ListB.CustomerID   -- each unordered pair once, no self pairs
-- mdq.Similarity arguments: method 3, containment bias 1.0, @MatchScore as
-- the minimum-score hint.
CROSS APPLY (
    SELECT MDS1.mdq.Similarity(ListA.FullName,  ListB.FullName,  3, 1.0, @MatchScore) AS NameScore
          ,MDS1.mdq.Similarity(ListA.BillEmail, ListB.BillEmail, 3, 1.0, @MatchScore) AS EmailScore
          ,MDS1.mdq.Similarity(ListA.Numbers,   ListB.Numbers,   3, 1.0, @MatchScore) AS NumberScore
          ,MDS1.mdq.Similarity(ListA.Phone,     ListB.Phone,     3, 1.0, @MatchScore) AS PhoneScore
) AS Score
WHERE (   Score.NameScore   >= @MatchScore
       OR Score.EmailScore  >= @MatchScore
       OR Score.NumberScore >= @MatchScore
       OR Score.PhoneScore  >= @MatchScore)
  AND (Score.NameScore + Score.EmailScore + Score.NumberScore + Score.PhoneScore) > 1
我先把它当作一个普通查询来测试,结果虽然出得很慢,但确实能出来。由于有130万条记录,它仍然非常消耗CPU。我希望在创建其他用于清理和更新客户表的存储过程之前,能再把它优化一些。
在得到这批初始数据之后,我会再写一个存储过程,在新客户进来时对它们进行清洗。
编辑#2
添加了其他列并重新索引表,以便通过不比较连接字符串来减少CPU使用率。通过添加2个额外的列并重新编制索引,我看到至少提高了10倍的速度。
-- Ad-hoc fuzzy match over [CustomerFix] using its precomputed BillFullName
-- and BillNumbers columns.
--
-- Returns pairs of customers for which at least one of name / e-mail /
-- "Numbers" / phone reaches @MatchScore according to MDS1.mdq.Similarity,
-- and whose four scores add up to more than 1.
--
-- Changes compared with the previous revision:
--   * Pairs are restricted to ListA.CustomerID < ListB.CustomerID. Without
--     this the self-join returned every customer paired with itself and
--     returned every real pair twice, as (A,B) and (B,A), doubling the number
--     of pairs that had to be scored. Each unordered pair is now reported once.
--   * The derived table that was written out twice is a single CTE.
--   * SELECT * was replaced by an explicit column list (same columns, same
--     order).
DECLARE @MatchScore float = .8;

WITH Candidate AS (
    SELECT cf.[CustomerID]
          ,cf.BillEmail
          ,cf.BillFullName AS [Name]
          ,cf.BillPhone    AS Phone
          ,cf.BillNumbers  AS Numbers
    FROM [CustomerFix] AS cf
)
SELECT ListA.CustomerID AS ListACustomerID
      ,ListA.BillEmail  AS ListAEmail
      ,ListA.[Name]     AS ListAFullname
      ,ListA.Numbers    AS ListAAddress
      ,ListA.Phone      AS ListAPhone
      ,ListB.CustomerID AS ListBCustomerID
      ,ListB.BillEmail  AS ListBEmail
      ,ListB.[Name]     AS ListBFullname
      ,ListB.Numbers    AS ListBAddress
      ,ListB.Phone      AS ListBPhone
      ,Score.NameScore
      ,Score.EmailScore
      ,Score.NumberScore
      ,Score.PhoneScore
FROM Candidate AS ListA
INNER JOIN Candidate AS ListB
    ON ListA.CustomerID < ListB.CustomerID   -- each unordered pair once, no self pairs
-- mdq.Similarity arguments: method 3, containment bias 1.0, @MatchScore as
-- the minimum-score hint.
CROSS APPLY (
    SELECT MDS1.mdq.Similarity(ListA.[Name],    ListB.[Name],    3, 1.0, @MatchScore) AS NameScore
          ,MDS1.mdq.Similarity(ListA.BillEmail, ListB.BillEmail, 3, 1.0, @MatchScore) AS EmailScore
          ,MDS1.mdq.Similarity(ListA.Numbers,   ListB.Numbers,   3, 1.0, @MatchScore) AS NumberScore
          ,MDS1.mdq.Similarity(ListA.Phone,     ListB.Phone,     3, 1.0, @MatchScore) AS PhoneScore
) AS Score
WHERE (   Score.NameScore   >= @MatchScore
       OR Score.EmailScore  >= @MatchScore
       OR Score.NumberScore >= @MatchScore
       OR Score.PhoneScore  >= @MatchScore)
  AND (Score.NameScore + Score.EmailScore + Score.NumberScore + Score.PhoneScore) > 1