我有一组数据需要选择最佳匹配。每条记录都有一个名称和CNum。具有相同“名称”的每条记录应具有相同的“CNum”;实际上,一些'Name'匹配具有相同的CNum而一些不具有(这是要解决的问题)。我需要确定哪个CNum更好,并使用单个CNum更新所有'Name'匹配。
我使用ParentId列更新了表格以显示匹配的名称,并使用SubParentId标记匹配的“名称”和“CNum”以帮助提取所需的结果(并使其更容易查看匹配)。
为了帮助判断同一“名称”组中哪个CNum更好,每条记录都有两个评分列:'ScoreA'和'ScoreB';得分越低越好。以下是我用来确定哪个CNum最好的规则(原文的规则列表在此处缺失;根据下方示例数据中的注释推断:先比较ScoreA,再比较ScoreB,分数仍相同时区域为“AB”的记录胜出,若仍无法区分则没有获胜者):
假设:如果他们有相同的姓名和CNum他们的分数将是相同的
是否有一种很好的方法可以应用上述规则来获得我正在寻找的结果?
以下是我正在寻找的数据样本和结果,并在插入语句旁边注明了预期的获胜结果:
-- create table
-- Results holds one row per (Name, CNum) observation that needs reconciling.
CREATE TABLE Results
(
    Id          INT IDENTITY( 1, 1 ) NOT NULL PRIMARY KEY, -- surrogate key, auto-assigned in insert order
    Name        VARCHAR(200)         NULL,                 -- grouping key: rows sharing a Name should share a CNum
    CNum        NVARCHAR(100)        NULL,                 -- the value being reconciled within each Name group
    Region      NVARCHAR(3)          NULL,                 -- region code (e.g. 'AB', 'ON', 'WY')
    ScoreA      INT                  NULL,                 -- first score column
    ScoreB      INT                  NULL,                 -- second score column
    ParentId    INT                  NULL,                 -- filled in later: Id of a row with the same Name
    SubParentId INT                  NULL,                 -- filled in later: Id of a row with the same Name and CNum
    NoMatch     BIT                  NOT NULL DEFAULT 0    -- NOTE(review): presumably flags groups without a winner; never written by the visible script
)
GO
-- insert data
-- ScoreA / ScoreB are INT columns, so the values are written as integer
-- literals instead of quoted strings (no implicit varchar -> int conversion).
-- One INSERT per row is kept on purpose so that identity values follow the
-- order of the statements in this script.
-- Grasslands: Leave as is --> they are all the same
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Grasslands', '91588', 'WY', -668, 13 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Grasslands', '91588', 'WY', -668, 13 )
-- Acme Co: winner noted below --> best ScoreA
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Acme Co', '269415003', 'AB', -13455, -23 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Acme Co', '269415003', 'AB', -13455, -23 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Acme Co', '5695003', 'AB', -155, -23 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Acme Co', '269415003', 'AB', -13460, -23 ) -- Expected Winner
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Acme Co', '5695003', 'AB', -155, -23 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Acme Co', '5695003', 'AB', -155, -23 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Acme Co', '856545', 'AB', -22, 16 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Acme Co', '856545', 'AB', -22, 16 )
-- Zuland Ltd: winner noted below --> best ScoreB
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Zuland Ltd', '654543', 'AB', -13455, -28 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Zuland Ltd', '654543', 'AB', -13455, -28 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Zuland Ltd', '654543', 'AB', -13455, -23 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Zuland Ltd', '5603', 'ON', -13455, -30 ) -- Expected Winner
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Zuland Ltd', '5603', 'ON', -13455, -23 )
-- Emco Inc: winner noted below --> AB tie breaker
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Emco Inc', '5695003', 'ON', -668, 13 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Emco Inc', '5695003', 'AB', -668, 13 ) -- Expected Winner
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Emco Inc', '5545', 'CA', -668, 13 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Emco Inc', '5545', 'CA', -668, 13 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Emco Inc', '995588', 'WY', -668, 13 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Emco Inc', '995588', 'WY', -668, 13 )
-- Zemco Inc: No Winner --> No AB tie breaker
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Zemco Inc', '5695003', 'TN', -668, 13 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Zemco Inc', '5695003', 'TN', -668, 13 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Zemco Inc', '5545', 'CA', -668, 13 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Zemco Inc', '995588', 'WY', -668, 13 )
-- Texco Inc: No Winner --> No AB tie breaker
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Texco Inc', '234JJJ', 'TN', -668, 13 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Texco Inc', '555552', 'TN', -668, 13 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Texco Inc', '234JJJ', 'CA', -668, 13 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Texco Inc', '555552', 'WY', -668, 13 )
-- Grasslands: Leave as is --> they are all the same
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Grasslands', '91588', 'WY', -668, 13 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Grasslands', '91588', 'WY', -668, 13 )
-- Mike Inc: No Match --> more than 1 'AB' with tied scores
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Mike Inc', '234JJJ', 'AB', -668, 13 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Mike Inc', '555552', 'AB', -668, 13 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Mike Inc', '234JJJ', 'AB', -668, 13 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Mike Inc', '555552222', 'WY', -668, 13 )
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Mike Inc', '90210', 'KT', -668, 13 )
GO
-- set parent id matched on Name
-- Every row receives the lowest Id found among the rows sharing its Name.
-- Joining Results directly to itself on Name would match every row of the
-- group, and UPDATE ... FROM with several matches per target row keeps an
-- arbitrary one, so rows of the same group could end up with different
-- ParentId values. Joining to one aggregated row per Name makes the result
-- deterministic.
-- Rows whose Name is NULL match no group (NULL = NULL is not true) and fall
-- back to their own Id through COALESCE.
UPDATE r
SET r.ParentId = COALESCE( g.FirstId, r.Id )
FROM Results r
LEFT JOIN ( SELECT Name,
                   MIN( Id ) AS FirstId
            FROM Results
            GROUP BY Name ) g
ON r.Name = g.Name
GO
-- set sub-parent id matched on Name and CNum
-- Every row receives the lowest Id found among the rows sharing both its Name
-- and its CNum. Joining to one aggregated row per (Name, CNum) guarantees a
-- single match per target row; a plain self-join would match the whole
-- sub-group and UPDATE ... FROM would then keep an arbitrary Id.
-- Rows with a NULL Name or NULL CNum match no group and fall back to their
-- own Id through COALESCE.
UPDATE r
SET r.SubParentId = COALESCE( g.FirstId, r.Id )
FROM Results r
LEFT JOIN ( SELECT Name,
                   CNum,
                   MIN( Id ) AS FirstId
            FROM Results
            GROUP BY Name, CNum ) g
ON r.Name = g.Name AND
r.CNum = g.CNum
GO
答案 0 :(得分:1)
以下是我根据你给出的规则写出的方案。将来唯一可能失效的地方是区域规则(规则4):如果优先区域不再是“AB”,这段代码就不再适用。由于在这里出现的区域中,“AB”按字母顺序排在最前面,所以我可以使用下面的代码:
-- Propagate the winning CNum to every row of the same Name.
--
-- Candidate order inside a Name group (lower is better):
--   1. ScoreA ascending
--   2. ScoreB ascending
--   3. Region 'AB' ahead of every other region (explicit tie-breaker, so the
--      rule no longer depends on 'AB' sorting first alphabetically)
--
-- DENSE_RANK keeps ALL rows that tie for first place. A group is updated only
-- when those top candidates agree on exactly one CNum; groups whose best rows
-- still carry different CNums (no 'AB' tie-breaker, or several 'AB' rows with
-- tied scores) are left untouched instead of receiving an arbitrary CNum.
--
-- ScoreA / ScoreB are INT columns, so no CAST is needed.
-- NOTE(review): NULL scores sort first in ascending order and would therefore
-- rank as "best" -- confirm whether NULL scores can occur.
-- NOTE: WITH must be the first statement of its batch or follow a ';'.
WITH ranked AS (
    SELECT
        r.[Name],
        r.[CNum],
        DENSE_RANK() OVER (
            PARTITION BY r.[Name]
            ORDER BY
                r.[ScoreA] ASC,
                r.[ScoreB] ASC,
                CASE WHEN r.[Region] = N'AB' THEN 0 ELSE 1 END ASC
        ) AS [CandidateRank]
    FROM [dbo].[Results] AS r
),
winner AS (
    -- one row per Name, only when the first-place rows share a single CNum
    SELECT
        [Name],
        MIN([CNum]) AS [CNum]
    FROM ranked
    WHERE [CandidateRank] = 1
    GROUP BY [Name]
    HAVING COUNT(DISTINCT [CNum]) = 1
)
UPDATE r2
SET [CNum] = w.[CNum]
FROM [dbo].[Results] AS r2
INNER JOIN winner AS w
    ON w.[Name] = r2.[Name]
-- touch only rows that actually change (including rows with no CNum yet)
WHERE r2.[CNum] <> w.[CNum]
   OR r2.[CNum] IS NULL;