我正在维护一个文档管理系统。其中一部分文档是从另一个系统导入的,由于一次错误,有些文档被导入了两次,我需要删除这些重复项。我手头有这些文档在原系统中的文档 ID,但不能仅凭该 ID 就直接删除:有些文档与多个帐户关联,本来就应该出现两次,因此还必须核对关联的值(帐户和税号)。这些关联值存放在不同的表中。我写了下面的脚本来找出需要删除的文档 ID,但它非常慢(在一个不到 200 万条记录的表上已经运行了四天)。
-- Walks every document that has a CONTENTS row with FOLID = 1, in ascending
-- DOCID order, and records a (duplicate, original) pair in deletedDuplicates
-- when a higher-numbered document shares the same source document id
-- (VTAB0031), account (VTAB0002) and tax id (VTAB0006).
--
-- NOTE(review): this issues several lookups per document; a single set-based
-- query over the same joins is generally far faster on large tables.
declare @docidtodelete int
declare @docid int
declare @sourcedocid varchar(12)
declare @taxid decimal(9,0)
declare @account bigint

-- start with the lowest candidate document id
select @docid = MIN(d.docid)
from DOCS d
inner join CONTENTS c on d.DOCID = c.DOCID and c.FOLID = 1

while @docid is not null
begin
    -- "select @var = col" leaves @var unchanged when no row matches, so clear
    -- the per-document values first; otherwise a document that lacks one of
    -- the index rows would be compared using the previous document's values.
    select @sourcedocid = null,
           @account = null,
           @taxid = null

    -- get the source document id for this document
    select @sourcedocid = val
    from VTAB0031
    where IDXID = 31 and DOCID = @docid

    -- see if there is another document with the same source document id
    -- (a NULL @sourcedocid matches nothing, so the result falls back to 0)
    select @docidtodelete = isnull(MAX(v.docid), 0)
    from VTAB0031 v
    inner join CONTENTS c on v.DOCID = c.DOCID and c.FOLID = 1
    where IDXID = 31 and VAL = @sourcedocid

    if @docid < @docidtodelete -- we have a possible duplicate so lets check and see if it matches on account
    begin
        select @account = val
        from VTAB0002
        where IDXID = 2 and DOCID = @docid

        -- keeps @docidtodelete only if that document has the same account; otherwise 0
        select @docidtodelete = isnull(max(v.docid), 0)
        from VTAB0002 v
        where IDXID = 2 and VAL = @account and v.DOCID = @docidtodelete

        if @docid < @docidtodelete -- we still have a possible duplicate so lets check and see if it matches on taxid
        begin
            select @taxid = val
            from VTAB0006
            where IDXID = 6 and DOCID = @docid

            -- keeps @docidtodelete only if that document has the same tax id; otherwise 0
            select @docidtodelete = isnull(max(v.docid), 0)
            from VTAB0006 v
            where IDXID = 6 and VAL = @taxid and v.DOCID = @docidtodelete

            if @docid < @docidtodelete -- we still have a match so delete
            begin
                -- values are (duplicate id, id to keep); the table definition is not
                -- visible here, so the column order is assumed -- TODO confirm
                insert into deletedDuplicates values(@docidtodelete, @docid)
            end
        end
    end

    -- advance to the next candidate document id; MIN over no rows yields NULL and ends the loop
    select @docid = MIN(d.docid)
    from DOCS d
    inner join CONTENTS c on d.DOCID = c.DOCID and c.FOLID = 1
    where d.DOCID > @docid
end
答案 0 :(得分:3)
在使用关系型数据库(RDBMS)时,最好使用基于集合(set-based)的操作,而不是过程式的逐行操作。
请改为尝试:
-- For each (SourceDocId, Account, TaxId) combination that occurs on more than
-- one document with a CONTENTS row in FOLID = 1, writes the highest DocId as
-- DocIdToDelete and the lowest as DocIdToKeep.
--
-- SELECT ... INTO creates deletedDuplicates, so the table must not already exist.
-- Only one pair is produced per combination: if a document was imported three
-- or more times, the ids between min and max are not reported.
select Duplicates.DocIdToDelete,
       Duplicates.DocIdToKeep
into deletedDuplicates
from
(
    select max(Summary.DocId) as DocIdToDelete,
           min(Summary.DocId) as DocIdToKeep,
           Summary.SourceDocId,
           Summary.Account,
           Summary.TaxId,
           count(*) as NumberMatches
    from
    (
        -- one row per document with its three identifying index values
        select d.docid as DocId,
               s.val as SourceDocId,
               a.val as Account,
               t.val as TaxId
        from DOCS d
        inner join CONTENTS c on c.DOCID = d.DOCID
        inner join VTAB0031 s on s.DOCID = d.DOCID
        inner join VTAB0002 a on a.DOCID = d.DOCID
        inner join VTAB0006 t on t.DOCID = d.DOCID
        where c.FOLID = 1
          and s.IDXID = 31
          and a.IDXID = 2
          and t.IDXID = 6
    ) Summary
    group by Summary.SourceDocId,
             Summary.Account,
             Summary.TaxId
    -- HAVING cannot reference the select-list alias NumberMatches,
    -- so the aggregate is repeated here
    having count(*) > 1
) Duplicates
<strong>更新</strong>
我写了一个新查询,它应该能找出所有重复记录(而不只是每组中 ID 最大的那一条)。借助索引,它的运行效率也会更高。
-- Holds one row per distinct (SourceDocId, Account, TaxId) combination together
-- with the DocId chosen for it. The clustered primary key on the three-column
-- combination enforces that uniqueness and supports lookups by those values.
create table UniqueDocuments
(
    DocId int not null,
    SourceDocId varchar(12) not null,
    Account bigint not null,
    TaxId decimal(9,0) not null,
    -- table-level constraint: separated from the last column definition by a
    -- comma, as the documented CREATE TABLE grammar requires
    primary key clustered (SourceDocId, Account, TaxId)
)
go
-- Populate UniqueDocuments with the lowest DOCID found for every distinct
-- combination of VTAB0031 / VTAB0002 / VTAB0006 values, considering only
-- documents that have a CONTENTS row with FOLID = 1. Because every join is an
-- inner join, the index-id filters are written directly in the ON clauses.
INSERT INTO UniqueDocuments (DocId, SourceDocId, Account, TaxId)
SELECT MIN(doc.DOCID) AS DocId,
       src.VAL        AS SourceDocId,
       acct.VAL       AS Account,
       tax.VAL        AS TaxId
FROM DOCS AS doc
INNER JOIN CONTENTS AS fol
        ON fol.DOCID = doc.DOCID
       AND fol.FOLID = 1
INNER JOIN VTAB0031 AS src
        ON src.DOCID = doc.DOCID
       AND src.IDXID = 31
INNER JOIN VTAB0002 AS acct
        ON acct.DOCID = doc.DOCID
       AND acct.IDXID = 2
INNER JOIN VTAB0006 AS tax
        ON tax.DOCID = doc.DOCID
       AND tax.IDXID = 6
GROUP BY src.VAL,
         acct.VAL,
         tax.VAL
-- Records every document whose (source document id, account, tax id) values
-- match a UniqueDocuments row but whose DocId differs from the one stored
-- there, i.e. every copy other than the one being kept.
--
-- Writes to deletedDuplicates, the result table used by the original script,
-- rather than the otherwise-undefined DeletedDocuments.
-- NOTE(review): assumes deletedDuplicates has columns DocIdToDelete and
-- DocIdToKeep -- confirm against the actual table definition.
insert into deletedDuplicates (DocIdToDelete, DocIdToKeep)
select d.DocId as DocIdToDelete,
       ud.DocId as DocIdToKeep
from DOCS d
inner join CONTENTS c on c.DOCID = d.DOCID
inner join VTAB0031 s on s.DOCID = d.DOCID
inner join VTAB0002 a on a.DOCID = d.DOCID
inner join VTAB0006 t on t.DOCID = d.DOCID
inner join UniqueDocuments ud on ud.SourceDocId = s.val
                             and ud.Account = a.val
                             and ud.TaxId = t.val
where c.FOLID = 1
  and s.IDXID = 31
  and a.IDXID = 2
  and t.IDXID = 6
  -- skip the copy that UniqueDocuments designates as the one to keep
  and d.DocId <> ud.DocId