如何改进此查询以删除重复项?

时间:2011-04-11 19:13:38

标签: sql-server

我正在处理一个文档管理系统。其中一些文档是从另一个系统导入的。由于一个错误,部分文档被导入了两次,我需要删除这些重复项。我有这些文档在原系统中的文档ID,但不能仅凭它来删除,因为有些文档与多个帐户关联,本来就应该出现两次,所以还必须检查这些关联值。关联值存放在不同的表中。我编写了下面的脚本来找出要删除的文档ID,但它非常慢(在一个记录不到200万条的表上已经运行了四天)。

-- Row-by-row duplicate finder.
-- Visits every DOCS row that has a CONTENTS row with FOLID = 1, in ascending
-- docid order. For each document it looks for a higher-numbered document in
-- the same folder with the same source document id (VTAB0031, IDXID = 31),
-- then requires that candidate to also match on account (VTAB0002, IDXID = 2)
-- and tax id (VTAB0006, IDXID = 6). Matching pairs are written to
-- deletedDuplicates as (higher docid, lower docid).
declare @docidtodelete int
declare @docid int
declare @sourcedocid varchar(12)
declare @taxid decimal(9,0)
declare @account bigint


select @docid = MIN(d.docid) from DOCS d
inner join CONTENTS c on d.DOCID = c.DOCID and c.FOLID=1
while @docid is not null
      begin
            -- "select @var = col ..." leaves @var untouched when no row matches,
            -- so clear the lookups first; otherwise a document with a missing
            -- index row would be compared using the previous document's values.
            -- A NULL lookup never equals VAL (default ANSI_NULLS ON), so the
            -- candidate falls back to 0 and the document is skipped.
            select @sourcedocid = null, @account = null, @taxid = null

            --get the source document id for this document
            select @sourcedocid = val from VTAB0031 where IDXID=31 and DOCID=@docid

            -- see if there is another document with the same source document id
            -- (isnull(...,0) turns "no match" into 0, which fails the @docid < test below)
            select @docidtodelete = isnull(MAX(v.docid),0) from VTAB0031 v
            inner join CONTENTS c on v.DOCID = c.DOCID and c.FOLID=1
            where IDXID=31 and VAL = @sourcedocid

            if @docid<@docidtodelete -- we have a possible duplicate so lets check and see if it matches on account
                  begin
                        select @account = val from VTAB0002 where IDXID=2 and DOCID=@docid
                        -- keeps the candidate only if it carries the same account value
                        select @docidtodelete = isnull(max(v.docid),0) from VTAB0002 v
                              where IDXID=2 and VAL = @account and v.DOCID=@docidtodelete
                        if @docid<@docidtodelete -- we still have a possible duplicate so lets check and see if it matches on taxid
                        begin
                              select @taxid = val from VTAB0006 where IDXID=6 and DOCID=@docid
                              -- keeps the candidate only if it carries the same tax id value
                              select @docidtodelete = isnull(max(v.docid),0) from VTAB0006 v
                                    where IDXID=6 and VAL = @taxid and v.DOCID = @docidtodelete
                              if @docid<@docidtodelete -- we still have a match so delete
                                                                begin
                                    -- NOTE(review): positional insert; the column names of
                                    -- deletedDuplicates are not visible here. Values are
                                    -- (document to delete, document to keep).
                                    insert into deletedDuplicates values(@docidtodelete ,@docid)
                                                                end
                        end
                  end
            -- advance to the next document in the folder; MIN over no rows yields NULL and ends the loop
            select @docid = MIN(d.docid) from DOCS d
                  inner join CONTENTS c on d.DOCID = c.DOCID and c.FOLID=1
                  where d.DOCID > @docid
      end

1 个答案:

答案 0 :(得分:3)

在使用 RDBMS 时,最好使用基于集合(set-based)的操作,而不是过程式(逐行)的操作。

请改为尝试:

-- Set-based duplicate detection.
-- Builds one row per document in folder 1 with its source document id,
-- account and tax id, groups on those three values, and for every group with
-- more than one document records the highest docid as the one to delete and
-- the lowest as the one to keep.
-- SELECT ... INTO creates deletedDuplicates as a new table.
-- Only one pair is produced per group, so a document imported three or more
-- times yields just its highest docid here.
select  DocIdToDelete,
        DocIdToKeep
into    deletedDuplicates
from
(
    select  max(DocId) as DocIdToDelete,
            min(DocId) as DocIdToKeep,
            SourceDocId,
            Account,
            TaxId,
            count(*) as NumberMatches
    from
    (
        -- one row per document with the three values that define a duplicate
        select  d.docid as DocId,
                s.val as SourceDocId,
                a.val as Account,
                t.val as TaxId
        from    DOCS d
                inner join CONTENTS c on c.DOCID = d.DOCID
                inner join VTAB0031 s on s.DOCID = d.DOCID
                inner join VTAB0002 a on a.DOCID = d.DOCID
                inner join VTAB0006 t on t.DOCID = d.DOCID
        where   c.FOLID = 1
                and s.IDXID = 31
                and a.IDXID = 2
                and t.IDXID = 6
    ) Summary
    group by    SourceDocId,
                Account,
                TaxId
    -- A select-list alias (NumberMatches) cannot be referenced in HAVING,
    -- so the aggregate is repeated here.
    having  count(*) > 1
) Duplicates

**更新**

我写了一个新的查询,它应该能找出所有的重复记录。借助索引,它的运行效率也应该更高。

-- One row per distinct (SourceDocId, Account, TaxId) combination; DocId is the
-- single document retained for that combination. The clustered primary key on
-- the three values both enforces uniqueness and supports the lookup join on
-- those columns.
create table UniqueDocuments
(
    DocId int not null,
    SourceDocId varchar(12) not null,
    Account bigint not null,
    TaxId decimal(9,0) not null,
    -- table-level constraint: separated from the last column by a comma and
    -- given an explicit name so it can be referenced in errors and migrations
    constraint PK_UniqueDocuments primary key clustered (SourceDocId, Account, TaxId)
)
go

-- Fill UniqueDocuments with the lowest docid for each distinct
-- (source document id, account, tax id) combination among documents that have
-- a CONTENTS row in folder 1. The IDXID / FOLID filters sit in the join
-- conditions; for inner joins this is equivalent to filtering in WHERE.
insert into UniqueDocuments
        (DocId, SourceDocId, Account, TaxId)
select
        min(doc.docid)  as DocId,
        src.val         as SourceDocId,
        acct.val        as Account,
        tax.val         as TaxId
from    DOCS as doc
        inner join CONTENTS as cont
                on  cont.DOCID = doc.DOCID
                and cont.FOLID = 1
        inner join VTAB0031 as src
                on  src.DOCID = doc.DOCID
                and src.IDXID = 31
        inner join VTAB0002 as acct
                on  acct.DOCID = doc.DOCID
                and acct.IDXID = 2
        inner join VTAB0006 as tax
                on  tax.DOCID = doc.DOCID
                and tax.IDXID = 6
group by
        src.val,
        acct.val,
        tax.val

-- Record every duplicate: each folder-1 document whose (source document id,
-- account, tax id) matches a UniqueDocuments row but whose docid is not the
-- one kept there. Unlike a max/min pair per group, this lists all extra
-- copies when a document was imported more than twice.
-- The target is deletedDuplicates, the table the other scripts here write
-- their (DocIdToDelete, DocIdToKeep) pairs to.
-- NOTE(review): confirm deletedDuplicates exists with these two column names
-- before running.
insert into deletedDuplicates (DocIdToDelete, DocIdToKeep)
select  d.DocId as DocIdToDelete,
        ud.DocId as DocIdToKeep
from    DOCS d
        inner join CONTENTS c on c.DOCID = d.DOCID
        inner join VTAB0031 s on s.DOCID = d.DOCID
        inner join VTAB0002 a on a.DOCID = d.DOCID
        inner join VTAB0006 t on t.DOCID = d.DOCID
        -- match on the full three-column key of UniqueDocuments
        inner join UniqueDocuments ud on ud.SourceDocId = s.val
                                         and ud.Account = a.val
                                         and ud.TaxId = t.val
where   c.FOLID = 1
        and s.IDXID = 31
        and a.IDXID = 2
        and t.IDXID = 6
        -- exclude the retained document itself
        and d.DocId <> ud.DocId