返回SQL Server

时间:2018-04-19 16:55:40

标签: sql sql-server duplicates matching ranking

我有一个查询,用条件匹配的方式对人员表中的记录进行匹配,以便创建“黄金记录”。在此过程中还有一个要求:按出现次数对匹配记录中的各个属性值进行排名,使黄金记录取到最佳(最常见)的值。下面是对我实际处理流程的高度简化,但足以说明我想做的事情。

我相信我现有的做法是有效的,但我认为可能有更好的方法。为了更清楚地展示我的做法,我把每个逻辑步骤的结果分别存入了临时表。

  • 步骤1:自连接,按多条规则匹配记录(满足这条规则或那条规则……)
  • 步骤2:按计数(出现次数)对匹配记录中的属性进行排名
  • 步骤3:为每组匹配创建一条黄金记录,并根据上一步的计数决定哪个属性值胜出
  • 步骤4:将黄金记录分配给原始人员记录

以下是一些示例数据和我现有的查询,以显示逻辑步骤:

-- Sample person table used to demonstrate duplicate matching.
-- Every attribute, including the birthday, is stored as text.
CREATE TABLE Persons
(
    ID        INT IDENTITY(1, 1),   -- identity column: seed 1, increment 1
    FirstName VARCHAR(255),
    LastName  VARCHAR(255),
    Address1  VARCHAR(255),
    City      VARCHAR(255),
    State     VARCHAR(255),
    BDay      VARCHAR(255),         -- birthday kept as a string, e.g. '4/8/1980'
    Email     VARCHAR(255)
);


-- Seed data: four JENNIFER ALLEN rows that differ in address, birthday or
-- e-mail, plus RICK and STEVEN who share only a last name and an e-mail.
-- The target columns are listed explicitly so the statement does not depend
-- on the table's column order; ID is omitted because it is an IDENTITY
-- column and is generated on insert.
INSERT INTO Persons (FirstName, LastName, Address1, City, State, BDay, Email)
SELECT 'RICK', 'ALLEN', '44 Street', 'Minneapolis', 'MN', '1/2/1970','help@test.com'
UNION ALL
SELECT 'JENNIFER', 'ALLEN', '123 Street', 'Minneapolis', 'MN', '4/8/1980','test@test.com'
UNION ALL
SELECT 'JENNIFER', 'ALLEN', '123 Street', 'Minneapolis', 'MN', '4/8/1981','test@test.com'
UNION ALL
SELECT 'JENNIFER', 'ALLEN', '42 Street', 'Minneapolis', 'MN', '4/8/1980','test@test.com'
UNION ALL
SELECT 'JENNIFER', 'ALLEN', '123 Street', 'Minneapolis', 'MN', '4/8/1980','test2@test.com'
UNION ALL
SELECT 'STEVEN', 'ALLEN', '555 Street', 'Minneapolis', 'MN', '2/8/1980','help@test.com';




-- Show the seeded rows together with their generated ID values.
SELECT
    ID,
    FirstName,
    LastName,
    Address1,
    City,
    State,
    BDay,
    Email
FROM Persons;

-- Step 1: pair each person (anchor, exposed as DupeID) with every person that
-- matches it (candidate, exposed as OriginalID). Two rows match when first and
-- last name are equal and at least one of address, birthday or e-mail is equal.
-- A row pairs with itself as long as the compared columns are not NULL.
-- The window counts give, per anchor, the total number of candidates and the
-- number of candidates sharing each attribute value.
SELECT
    candidate.FirstName,
    candidate.LastName,
    candidate.Address1,
    candidate.BDay,
    candidate.Email,
    COUNT(*) OVER (PARTITION BY anchor.ID)                      AS [MatchCount],
    COUNT(*) OVER (PARTITION BY anchor.ID, candidate.FirstName) AS [MatchCount_FirstName],
    COUNT(*) OVER (PARTITION BY anchor.ID, candidate.Address1)  AS [MatchCount_Address1],
    COUNT(*) OVER (PARTITION BY anchor.ID, candidate.BDay)      AS [MatchCount_BDay],
    COUNT(*) OVER (PARTITION BY anchor.ID, candidate.Email)     AS [MatchCount_Email],
    candidate.ID AS OriginalID,
    anchor.ID    AS DupeID
INTO #tmp_dups
FROM Persons AS anchor
INNER JOIN Persons AS candidate
    ON  candidate.FirstName = anchor.FirstName
    AND candidate.LastName  = anchor.LastName
    AND (
            candidate.Address1 = anchor.Address1
         OR candidate.BDay     = anchor.BDay
         OR candidate.Email    = anchor.Email
        );

-- For every record (DupeID), keep the lowest ID among the records it matched.
-- The final query joins on this value to find the record's golden record.
SELECT
    MIN(d.OriginalID) AS OriginalID,
    d.DupeID
INTO #tmp_matches
FROM #tmp_dups AS d
GROUP BY d.DupeID;

-- Step 2: within each DupeID, rank the matched rows per attribute so that the
-- row carrying the most frequent value gets rank 1.
-- Ordering by the count alone leaves the rank-1 row arbitrary when two values
-- are equally frequent, so each ranking adds tie-breakers: first the attribute
-- value itself (descending where the golden-record step uses MAX, ascending
-- for Email where it uses MIN), then OriginalID. This makes the ranking, and
-- therefore the chosen value, repeatable from run to run.
SELECT
    d.FirstName,
    d.LastName,
    d.Address1,
    d.BDay,
    d.Email,
    d.[MatchCount],
    d.[MatchCount_FirstName],
    d.[MatchCount_Address1],
    d.[MatchCount_BDay],
    d.[MatchCount_Email],
    d.OriginalID,
    d.DupeID,
    ROW_NUMBER() OVER (
        PARTITION BY d.[DupeID]
        ORDER BY d.[MatchCount_FirstName] DESC, d.FirstName DESC, d.OriginalID
    ) AS [Match_RankByCount_FirstName],
    ROW_NUMBER() OVER (
        PARTITION BY d.[DupeID]
        ORDER BY d.[MatchCount_Address1] DESC, d.Address1 DESC, d.OriginalID
    ) AS [Match_RankByCount_Address1],
    ROW_NUMBER() OVER (
        PARTITION BY d.[DupeID]
        ORDER BY d.[MatchCount_BDay] DESC, d.BDay DESC, d.OriginalID
    ) AS [Match_RankByCount_BDay],
    ROW_NUMBER() OVER (
        PARTITION BY d.[DupeID]
        ORDER BY d.[MatchCount_Email] DESC, d.Email ASC, d.OriginalID
    ) AS [Match_RankByCount_Email]
INTO #tmp_rankdups
FROM #tmp_dups AS d;


-- Step 3: build one golden record per surviving match group.
-- The HAVING filter keeps only groups whose DupeID equals the lowest ID that
-- record matched, so each set of duplicates is represented by one record.
-- Each attribute comes from the row ranked 1 for that attribute; the CASE
-- yields NULL for all other rows, so MAX/MIN simply collapses the group to
-- that single value.
-- GoldenRecordID numbers the surviving groups by DupeID, starting at 100001
-- (presumably to keep golden IDs apart from Persons.ID -- confirm).
-- NOTE(review): matching is pairwise, not transitive. If A matches B and B
-- matches C but A does not match C, then C points at B as its lowest match,
-- yet B is filtered out here because B's lowest match is A. C then finds no
-- golden record in the final join. Confirm whether such chains can occur.
SELECT
    ROW_NUMBER() OVER (ORDER BY r.DupeID) + 100000 AS GoldenRecordID,
    r.DupeID,
    MAX(CASE WHEN r.[Match_RankByCount_FirstName] = 1 THEN r.[FirstName] END) AS [FirstName],
    r.LastName,
    -- Source column is referenced with its declared spelling (Address1) so the
    -- statement also works under a case-sensitive collation; the output alias
    -- is unchanged.
    MAX(CASE WHEN r.[Match_RankByCount_Address1] = 1 THEN r.[Address1] END) AS [address1],
    MAX(CASE WHEN r.[Match_RankByCount_BDay] = 1 THEN r.[BDay] END) AS [BDay],
    MIN(CASE WHEN r.[Match_RankByCount_Email] = 1 THEN r.[Email] END) AS [Email]
INTO #tmp_goldenrecords
FROM #tmp_rankdups AS r
GROUP BY
    r.DupeID,
    r.LastName
HAVING MIN(r.OriginalID) = r.DupeID;

-- Step 4: attach a golden record to each original person row.
-- #tmp_matches maps a person's ID to the lowest ID it matched, and that ID is
-- the DupeID under which the golden record was stored.
-- All columns of the three sources are returned, in FROM-clause order.
SELECT
    person.*,
    link.*,
    golden.*
FROM Persons AS person
INNER JOIN #tmp_matches AS link
    ON link.DupeID = person.ID
INNER JOIN #tmp_goldenrecords AS golden
    ON link.OriginalID = golden.DupeID;

0 个答案:

没有答案