我需要在 SQL Server 2016 中选择重复记录的一个子集。以下是数据集和我使用的代码。我只需要选择以红色突出显示的重复项。基本上，我只需要 LName、FName、DateOfBirth、StreetAddress 的值都相同、并且 Source 为 NULL 的那些重复记录；同时，这些记录还必须与一条上述字段相同、且 Source 值为 'Company XYZ' 的记录相匹配。
-- Recreate the #Dataset temp table that holds the sample person records.
-- Any existing copy from an earlier run in this session is dropped first.
DROP TABLE IF EXISTS #Dataset;
GO

CREATE TABLE #Dataset
(
    ID            INT         NOT NULL,
    LName         VARCHAR(50) NULL,
    Fname         VARCHAR(50) NULL,
    DateOfBirth   VARCHAR(50) NULL,  -- stored as text, not as a date type
    StreetAddress VARCHAR(50) NULL,
    Source        VARCHAR(50) NULL   -- NULL or a source name such as 'Company XYZ'
);
-- Load the twelve sample rows into #Dataset.
-- ID is an INT column, so the IDs are written as integer literals rather than
-- quoted strings; this avoids relying on an implicit varchar-to-int conversion.
-- The leading space inside each StreetAddress literal is part of the sample
-- data and is kept as-is.
-- NOTE(review): DateOfBirth values look like spreadsheet serial day numbers
-- stored as text -- confirm with the data owner before treating them as dates.
INSERT INTO #Dataset (ID, LName, Fname, DateOfBirth, StreetAddress, Source)
VALUES
    (1,  'John',    'Ganske',  '37171', ' 1223 Sunrise St',    'Company XYZ'),
    (2,  'John',    'Ganske',  '37171', ' 1233 Sunrise St',    'Company XYZ'),
    (4,  'Brent',   'Paine',   '20723', ' 5443 Fox Dr',        NULL),
    (3,  'Brent',   'Paine',   '20723', ' 5443 Fox Dr',        'Company XYZ'),
    (5,  'Adam',    'Smith',   '22805', ' 1254 Lake Ridge Ct', NULL),
    (6,  'Adam',    'Smith',   '22805', ' 1254 Lake Ridge Ct', NULL),
    (7,  'Adam',    'Smith',   '22805', ' 1254 Lake Ridge Ct', 'Company XYZ'),
    (8,  'Timothy', 'Johnson', '36165', ' 1278 Lee H-W',       NULL),
    (9,  'Timothy', 'Johnson', '36165', ' 1278 Lee H-W',       NULL),
    (10, 'Judy',    'Wilson',  '32579', ' 5678 Dotties Dr',    'Company XYZ'),
    (12, 'Peter',   'Pan',     '37507', NULL,                  NULL),
    (11, 'Peter',   'Pan',     '37507', NULL,                  'Company XYZ');
--select * from #Dataset
-- Returns every #Dataset row whose (LName, Fname, DateOfBirth, StreetAddress)
-- combination occurs more than once. Source is not considered here.
WITH duplicate_keys AS (
    -- One row per key combination that appears at least twice.
    SELECT
        LName,
        Fname,
        DateOfBirth,
        StreetAddress
    FROM #Dataset
    GROUP BY
        LName,
        Fname,
        DateOfBirth,
        StreetAddress
    HAVING COUNT(*) > 1
),
first_ids AS (
    -- Lowest ID of each duplicated key combination.
    SELECT MIN(ID) AS ID
    FROM #Dataset
    GROUP BY
        LName,
        Fname,
        DateOfBirth,
        StreetAddress
    HAVING COUNT(*) > 1
)
SELECT
    src.ID,
    src.LName,
    src.Fname,
    src.DateOfBirth,
    src.StreetAddress,
    src.Source
FROM #Dataset AS src
-- Plain equality never matches NULL, so a row with NULL in any key column is
-- not returned here even though GROUP BY counts such rows as one group.
INNER JOIN duplicate_keys AS dup
    ON  src.LName = dup.LName
    AND src.Fname = dup.Fname
    AND src.DateOfBirth = dup.DateOfBirth
    AND src.StreetAddress = dup.StreetAddress
-- No column of first_ids is selected and nothing filters on it.
LEFT JOIN first_ids AS fst
    ON src.ID = fst.ID;
我的输出如下所示:
答案 0 :(得分:2)
您可以使用 ROW_NUMBER：
-- Numbers the rows of each (LName, Fname, DateOfBirth, StreetAddress) group
-- from the highest ID down, then returns every row except the highest-ID one.
WITH numbered AS (
    SELECT
        ID,
        LName,
        Fname,
        DateOfBirth,
        StreetAddress,
        Source,
        ROW_NUMBER() OVER (
            PARTITION BY LName, Fname, DateOfBirth, StreetAddress
            ORDER BY ID DESC
        ) AS rn
    FROM #Dataset
)
SELECT
    ID,
    LName,
    Fname,
    DateOfBirth,
    StreetAddress,
    Source,
    rn
FROM numbered
WHERE rn >= 2
ORDER BY ID;
编辑:
-- Returns the rows with a NULL Source from every
-- (LName, Fname, DateOfBirth, StreetAddress) group that also contains at least
-- one 'Company XYZ' row.
--
-- The earlier version also required rn > 1. With rn ordered by ID DESC, that
-- dropped a qualifying NULL-Source row whenever it had the highest ID in its
-- group (IDs 4 and 12 in the sample data). A NULL-Source row in a group with
-- cnt > 0 always has a 'Company XYZ' partner, so no row-number filter is
-- needed to establish that it is a duplicate.
WITH cte AS (
    SELECT
        ID,
        LName,
        Fname,
        DateOfBirth,
        StreetAddress,
        [Source],
        -- Position inside the group, highest ID first. Kept in the output so
        -- the result set has the same columns as before; not filtered on.
        ROW_NUMBER() OVER (
            PARTITION BY LName, Fname, DateOfBirth, StreetAddress
            ORDER BY ID DESC
        ) AS rn,
        -- Number of 'Company XYZ' rows in the same group.
        SUM(CASE WHEN [Source] = 'Company XYZ' THEN 1 ELSE 0 END) OVER (
            PARTITION BY LName, Fname, DateOfBirth, StreetAddress
        ) AS cnt
    FROM #Dataset
)
SELECT
    ID,
    LName,
    Fname,
    DateOfBirth,
    StreetAddress,
    [Source],
    rn,
    cnt
FROM cte
WHERE cnt > 0
  AND [Source] IS NULL
ORDER BY ID;
编辑2 :
-- Returns the NULL-Source rows of every (LName, Fname, DateOfBirth,
-- StreetAddress) group that has at least one NULL-Source row, at least one
-- 'Company XYZ' row, and more than one row in total.
WITH group_counts AS (
    SELECT
        ID,
        LName,
        Fname,
        DateOfBirth,
        StreetAddress,
        Source,
        -- c1: rows in the group whose Source is NULL.
        SUM(CASE WHEN Source IS NULL THEN 1 ELSE 0 END) OVER (
            PARTITION BY LName, Fname, DateOfBirth, StreetAddress
        ) AS c1,
        -- c2: rows in the group whose Source is 'Company XYZ'.
        SUM(CASE WHEN Source = 'Company XYZ' THEN 1 ELSE 0 END) OVER (
            PARTITION BY LName, Fname, DateOfBirth, StreetAddress
        ) AS c2,
        -- c3: total rows in the group.
        COUNT(*) OVER (
            PARTITION BY LName, Fname, DateOfBirth, StreetAddress
        ) AS c3
    FROM #Dataset
)
SELECT
    ID,
    LName,
    Fname,
    DateOfBirth,
    StreetAddress,
    Source,
    c1,
    c2,
    c3
FROM group_counts
WHERE Source IS NULL
  AND c1 >= 1
  AND c2 >= 1
  AND c3 >= 2
ORDER BY ID;