TSQL链接数据并拆分为匹配和不匹配的表

时间:2013-05-07 15:12:12

标签: sql sql-server tsql

我有两个数据表,其中包含如下数据: -

|  id   | name |   dob    |          | name |   dob    |
|-------|------|----------|          |------|----------|  
| 12345 | ABC  | 20010301 |          | ABC  | 20010301 |  - matching record
| 45678 | DEF  | 20010425 |          | XYZ  | 20010301 |  - unmatched record

是否可以编写一个查询来比较这两个表,然后分别创建一个匹配表和一个不匹配表,同时保留原始表的结构和数据?

  Match Table        Unmatched Table
|  id   | rank |     |  id   | rank |   
|-------|------|     |-------|------|
| 12345 |  1   |     | 45678 | NULL |

我正在尝试使用MERGE,但这样我必须对源表执行插入/更新,而且我的T-SQL水平已经到了极限。另外,我要处理的数据集超过30,000,000行。有什么建议吗?
我目前写的SQL如下(字段与上面的示例不一致,但原理相同):

-- Demo fixture: #Cohort holds the rows that are to be classified as matched or
-- unmatched. Every row starts with [match rank] = NULL.
Create TABLE #Cohort ([ID] varchar(4),[match rank] int)
INSERT INTO #Cohort ([ID],[match rank]) VALUES('aaaa',NULL)
INSERT INTO #Cohort ([ID],[match rank]) VALUES('bbbb',NULL)
INSERT INTO #Cohort ([ID],[match rank]) VALUES('cccc',NULL)
INSERT INTO #Cohort ([ID],[match rank]) VALUES('dddd',NULL)

-- Demo fixture: #link is the table #Cohort is compared against. Three rows get
-- a random ID (the first four characters of a NEWID() value); 'aaaa' is the one
-- literal ID that also exists in #Cohort.
Create TABLE #link ([ID] varchar(4),[match rank] int)
INSERT INTO #link ([ID],[match rank]) VALUES(left(NEWID(),4),NULL)
INSERT INTO #link ([ID],[match rank]) VALUES(left(NEWID(),4),NULL)
INSERT INTO #link ([ID],[match rank]) VALUES('aaaa',NULL)
INSERT INTO #link ([ID],[match rank]) VALUES(left(NEWID(),4),NULL)

-- Destination tables, declared with the same two columns as #Cohort.
-- NOTE: no statement in this script writes to #Matches.
Create TABLE #Matches ([ID] varchar(4),[match rank] int)
Create TABLE #Unmatched ([ID] varchar(4),[match rank] int)

-- Target (tg) is #Cohort itself. Source (sc) is the distinct set of #Cohort IDs
-- that also occur in #link, each tagged with [match rank] = 1.
MERGE #Cohort tg
USING (SELECT distinct c.[ID], 1 as [match rank] 
        from #Cohort c
        INNER JOIN #link as h on c.[ID]=h.[ID]) sc
ON (tg.[ID] = sc.[ID] )
-- Every source ID is read from #Cohort, so it always has a target row and this
-- branch has nothing to insert.
WHEN NOT MATCHED BY TARGET
     THEN INSERT([ID],[match rank]) VALUES(sc.[ID],sc.[match rank])
-- Deletes each #Cohort row whose ID has no counterpart in #link. This modifies
-- #Cohort itself.
WHEN NOT MATCHED BY SOURCE
     THEN DELETE
-- Copies the deleted #Cohort rows into #Unmatched. There is no WHEN MATCHED
-- branch, so the [match rank] = 1 computed in the source is never written to
-- the surviving rows.
OUTPUT Deleted.* INTO #Unmatched;

3 个答案:

答案 0 :(得分:0)

查找匹配/不匹配记录的标准方法是执行左连接并在左连接表中查找NULL。

-- Matched rows: one row per Table1.id that has at least one Table2 row with the
-- same name; [rank] is the number of such Table2 rows.
-- An INNER JOIN is used because a LEFT JOIN filtered by "t2.name IS NOT NULL"
-- discards the unmatched rows anyway, i.e. it is an inner join in disguise.
-- No ORDER BY: the row order of a table created by SELECT ... INTO is not
-- guaranteed, so sort when reading the table instead.
-- NOTE(review): the join compares name only. If a match must also agree on dob,
-- add "AND t2.dob = t1.dob" to both ON clauses; confirm against the requirement.
SELECT
    t1.id,
    COUNT(t2.name) AS [rank]
INTO #MatchedTable
FROM Table1 AS t1
INNER JOIN Table2 AS t2
    ON t2.name = t1.name
GROUP BY t1.id;

-- Unmatched rows: the LEFT JOIN keeps every Table1 row, and "t2.name IS NULL"
-- retains only those for which no Table2 row was found (anti-join).
-- GROUP BY collapses repeated ids to a single row.
-- The NULL is cast explicitly so the [rank] column of the new table has a
-- declared type instead of relying on the default for an untyped NULL.
SELECT
    t1.id,
    CAST(NULL AS INT) AS [rank]
INTO #UnmatchedTable
FROM Table1 AS t1
LEFT JOIN Table2 AS t2
    ON t2.name = t1.name
WHERE t2.name IS NULL
GROUP BY t1.id;

我希望这会有所帮助。

答案 1 :(得分:0)

使用CTE,最终匹配的行会写入#Matches,不匹配的行会写入#Unmatched。按照目前的写法,您的MERGE语句会删除#Cohort表中的行,只留下ID为aaaa的那一行。

-- Self-contained demo: builds two small temp tables, copies the matching and
-- non-matching IDs into #Matches / #Unmatched using CTEs, shows all four
-- tables, then cleans up. #Cohort and #link are only read, never modified.

-- Fixture: the rows to classify.
CREATE TABLE #Cohort ([ID] VARCHAR(4), [MATCH RANK] INT);
INSERT INTO #Cohort ([ID], [MATCH RANK])
VALUES ('aaaa', NULL),
       ('bbbb', NULL),
       ('cccc', NULL),
       ('dddd', NULL);

-- Fixture: the table to compare against. Three rows get a random ID (first
-- four characters of a NEWID() value); 'aaaa' is the ID shared with #Cohort.
CREATE TABLE #link ([ID] VARCHAR(4), [MATCH RANK] INT);
INSERT INTO #link ([ID], [MATCH RANK])
VALUES (LEFT(NEWID(), 4), NULL),
       (LEFT(NEWID(), 4), NULL),
       ('aaaa', NULL),
       (LEFT(NEWID(), 4), NULL);

-- Result tables, same structure as the inputs.
CREATE TABLE #Matches ([ID] VARCHAR(4), [MATCH RANK] INT);
CREATE TABLE #Unmatched ([ID] VARCHAR(4), [MATCH RANK] INT);

-- Matched: distinct #Cohort rows whose ID also exists in #link.
WITH MatchedTbl AS
(
    SELECT DISTINCT
        c.[ID],
        c.[MATCH RANK]
    FROM #Cohort AS c
    INNER JOIN #link AS h
        ON h.[ID] = c.[ID]
)
INSERT INTO #Matches ([ID], [MATCH RANK])
SELECT
    m.[ID],
    m.[MATCH RANK]
FROM MatchedTbl AS m;

-- Unmatched: distinct #link rows whose ID does not exist in #Cohort.
-- NOT EXISTS is used instead of NOT IN because [ID] is nullable: a single NULL
-- returned by a NOT IN subquery makes the predicate UNKNOWN for every row and
-- the result silently becomes empty.
-- NOTE(review): this collects #link IDs that are missing from #Cohort. The
-- question's expected "Unmatched Table" lists IDs of the table being classified
-- (the #Cohort side); swap the two tables here if that is what is required.
WITH NonMatchedTbl AS
(
    SELECT DISTINCT
        l.[ID],
        l.[MATCH RANK]
    FROM #link AS l
    WHERE NOT EXISTS (
        SELECT 1
        FROM #Cohort AS c
        WHERE c.[ID] = l.[ID]
    )
)
INSERT INTO #Unmatched ([ID], [MATCH RANK])
SELECT
    n.[ID],
    n.[MATCH RANK]
FROM NonMatchedTbl AS n;

-- Show inputs and results.
SELECT [ID], [MATCH RANK] FROM #Cohort;
SELECT [ID], [MATCH RANK] FROM #link;
SELECT [ID], [MATCH RANK] FROM #Matches;
SELECT [ID], [MATCH RANK] FROM #Unmatched;

-- Clean up the temp tables created above.
DROP TABLE #Cohort;
DROP TABLE #link;
DROP TABLE #Matches;
DROP TABLE #Unmatched;

答案 2 :(得分:0)

如果您要处理海量数据,可以尝试两种做法。如果仍想使用MERGE语句,可以分批执行,而不是一次性完成;或者先划分批次,再直接插入。无论哪种方式,我都建议先建立一个暂存区,在其上创建索引,然后再插入。可以使用NTILE函数来分配批次。下面的自包含示例可在SQL Server 2008或更高版本中运行:

-- Self-contained batching demo (table variables, runs as a single batch).
-- 1. Build sample people and orders.
-- 2. Stage the person/order LEFT JOIN together with an NTILE batch number.
-- 3. For one chosen batch, route rows with an order into @Matched and rows
--    without an order into @UnMatched.

DECLARE @Person TABLE (personID INT IDENTITY, person VARCHAR(8));

INSERT INTO @Person (person)
VALUES ('Brett'), ('Sean'), ('Chad'), ('Michael'), ('Ray'),
       ('Erik'), ('Quyen'), ('John'), ('Tim');

DECLARE @Orders TABLE (OrderID INT IDENTITY, PersonID INT, Description VARCHAR(32), Amount INT);

INSERT INTO @Orders (PersonID, Description, Amount)
VALUES (1, 'Shirt', 20), (1, 'Shoes', 50),
       (2, 'Shirt', 22), (2, 'Shoes', 52),
       (3, 'Shirt', 20), (3, 'Shoes', 50), (3, 'Hat', 20),
       (4, 'Shirt', 20),
       (5, 'Shirt', 20), (5, 'Pants', 30),
       (6, 'Shirt', 20), (6, 'RunningShoes', 70),
       (7, 'Shirt', 22), (7, 'Shoes', 40), (7, 'Coat', 80);

-- Staging area: every person/order row plus the batch it was assigned to.
DECLARE @Storage TABLE (batch INT, personid INT, person VARCHAR(8), orderid INT, Description VARCHAR(32), amount INT);

INSERT INTO @Storage (batch, personid, person, orderid, Description, amount)
SELECT
    -- NTILE(5) splits the whole ordered row set into 5 groups of (nearly) equal
    -- size, e.g. 500 rows would give 5 batches of 100 rows each.
    -- OrderID is a tie-breaker so that rows of a person with several orders
    -- always land in the same batch from run to run.
    NTILE(5) OVER (ORDER BY p.personID, o.OrderID),
    p.personID,
    p.person,
    o.OrderID,
    o.Description,
    o.Amount
FROM @Person AS p
-- LEFT JOIN keeps people who have no orders; their order columns are NULL.
LEFT JOIN @Orders AS o
    ON o.PersonID = p.personID;

-- Batch number to process in this run (1 to 5). Despite its name this is a
-- plain integer, not a T-SQL cursor; change it to process a different batch.
DECLARE @Cursor INT = 5;

-- Stand-in destination tables for the matched / unmatched rows.
DECLARE @Matched TABLE (personid INT, person VARCHAR(8), orderid INT, Description VARCHAR(32), amount INT);
DECLARE @UnMatched TABLE (personid INT, person VARCHAR(8), orderid INT, Description VARCHAR(32), amount INT);

-- Matched: rows of the chosen batch that have an order.
INSERT INTO @Matched (personid, person, orderid, Description, amount)
SELECT
    s.personid,
    s.person,
    s.orderid,
    s.Description,
    s.amount
FROM @Storage AS s
WHERE s.batch = @Cursor
  AND s.orderid IS NOT NULL;

-- Unmatched: rows of the chosen batch for people without any order.
INSERT INTO @UnMatched (personid, person, orderid, Description, amount)
SELECT
    s.personid,
    s.person,
    s.orderid,
    s.Description,
    s.amount
FROM @Storage AS s
WHERE s.batch = @Cursor
  AND s.orderid IS NULL;

SELECT personid, person, orderid, Description, amount FROM @Matched;
SELECT personid, person, orderid, Description, amount FROM @UnMatched;

我的示例非常简单,但您可以更改'cursor'变量,查看从暂存表中取出的不同批次的结果。由于采用了批处理,我不必一次处理整个数据集:可以先把数据放入暂存表,再编写一个存储过程,根据不断变化的游标值(一个整数)逐批执行插入。您甚至可以添加一个bit类型的列,用来标记数据是否已被处理。