如何在SQL中匹配记录的行组?

时间:2015-02-12 22:39:25

标签: sql sql-server tsql

我有一个存储相关行组的表,不同的行通过groupIdentifier列相关。组的大小可以是任意数量的行。

我需要能够传入一组新的行组,然后找出每个新组与现有匹配组之间的映射。复杂之处在于,组中各行的顺序由rowOrdinal值定义,匹配时必须考虑在内。rowOrdinal值并不总是从0开始,但组内的行按该值排序。此外,@existingData包含数以千计的潜在组,因此查询必须高效。

以下是输入数据集示例:

-- Existing groups: rows that share a groupIdentifier form one group,
-- ordered inside the group by rowOrdinal.
DECLARE @existingData TABLE (
    groupIdentifier INT,
    rowOrdinal INT,
    value VARCHAR(1)
);

INSERT INTO @existingData (groupIdentifier, rowOrdinal, value)
VALUES
    (100, 0, 'X'), (100, 1, 'Y'),
    (200, 0, 'A'), (200, 1, 'B'), (200, 2, 'C'),
    (40, 0, 'X'),
    (41, 0, 'Y');

-- Incoming groups that have to be looked up among the existing ones.
DECLARE @newData TABLE (
    groupIdentifier INT,
    rowOrdinal INT,
    value VARCHAR(1)
);

INSERT INTO @newData (groupIdentifier, rowOrdinal, value)
VALUES
    (1, 55, 'X'), (1, 59, 'Y'),
    (2, 0, 'Y'), (2, 1, 'X');

-- Expected: @newData group 1 matches @existingData group 100;
-- @newData group 2 has no match in @existingData.

所需的结果是一个包含两列的结果集:existingGroupIdentifier和newGroupIdentifier。在这个例子中,唯一的结果行是(100, 1):100是@existingData中的groupIdentifier,1是@newData中的groupIdentifier。

修改:以下是我目前想到的做法。假设最大组大小为N,我可以手动复制粘贴使用pivot和临时表的T-SQL代码,为每一种组大小分别进行比较。但是,这会把系统支持的组大小限制为N,而且代码很难看;如果可能的话,我更希望在单个查询中完成。

-- Existing groups: one row per (groupIdentifier, rowOrdinal) with a single-character value.
DECLARE @existingData TABLE (
    groupIdentifier INT,
    rowOrdinal INT,
    value VARCHAR(1)
);

INSERT INTO @existingData (groupIdentifier, rowOrdinal, value)
VALUES
    (100, 0, 'X'),
    (100, 1, 'Y'),
    (200, 0, 'A'),
    (200, 1, 'B'),
    (200, 2, 'C'),
    (40, 0, 'X'),
    (41, 0, 'Y');

-- Incoming groups of sizes one, two and three; rowOrdinal values are deliberately
-- not zero-based and not contiguous.
DECLARE @newData TABLE (
    groupIdentifier INT,
    rowOrdinal INT,
    value VARCHAR(1)
);

INSERT INTO @newData (groupIdentifier, rowOrdinal, value)
VALUES
    (1, 55, 'X'),
    (1, 59, 'Y'),
    (2, 0, 'Y'),
    (2, 1, 'X'),
    (3, 99, 'Y'),
    (5, 4, 'A'),
    (5, 10, 'B'),
    (5, 200, 'C');


-- First build a table with the size of each group, limiting @existingData to only potentially
-- matching groups (groups that have at least one value in common with @newData).
-- groupSize is the group's FULL row count: the filter below keeps or drops whole groups instead of
-- individual rows, so a group that shares only some of its values is never recorded with a
-- truncated size (which would let it be compared against smaller groups by mistake).
declare @potentialGroupsInExistingData table (groupIdentifier int, groupSize int)

insert into @potentialGroupsInExistingData (groupIdentifier, groupSize)
    select
        ExistingData.groupIdentifier,
        COUNT(ExistingData.groupIdentifier)
    from
        @existingData ExistingData
    where
        ExistingData.groupIdentifier in (
            -- groups owning at least one row whose value also occurs in @newData
            select
                SharedValueRow.groupIdentifier
            from
                @existingData SharedValueRow
            where
                exists (select * from @newData NewData where NewData.value = SharedValueRow.value))
    group by ExistingData.groupIdentifier

-- Size of every group in @newData (no filtering on this side).
declare @groupsInNewData table (groupIdentifier int, groupSize int)

insert into @groupsInNewData (groupIdentifier, groupSize)
    select
        NewData.groupIdentifier,
        COUNT(NewData.groupIdentifier)
    from
        @newData NewData
    group by NewData.groupIdentifier


-- handle groups of size one: no pivoting is needed, a single-row group matches another
-- single-row group exactly when their only values are equal
-----------------------------------
select
    ExistingRow.groupIdentifier as ExistingGroupIdentifier,
    NewRow.groupIdentifier as NewGroupIdentifier
from
    @existingData ExistingRow
    -- keep only rows that belong to candidate existing groups of size one
    inner join @potentialGroupsInExistingData ExistingSize on
        ExistingSize.groupIdentifier = ExistingRow.groupIdentifier
        and ExistingSize.groupSize = 1
    inner join @newData NewRow on
        NewRow.value = ExistingRow.value
    -- keep only rows that belong to new groups of size one
    inner join @groupsInNewData NewSize on
        NewSize.groupIdentifier = NewRow.groupIdentifier
        and NewSize.groupSize = 1


-- handle groups of size two
-----------------------------------
-- One row per candidate existing group of exactly two rows, with both values side by side.
-- ROW_NUMBER() counts down from the highest rowOrdinal, so valueOne holds the value of the LAST
-- row and valueTwo the value of the first one; the new-data side below numbers its rows in the
-- same direction, so the comparison is unaffected.
declare @existingGroupsOfSizeTwo table (groupIdentifier int, valueOne varchar(1), valueTwo varchar(1))

insert into @existingGroupsOfSizeTwo (groupIdentifier, valueOne, valueTwo)
    select
        p.groupIdentifier,
        p.[1],
        p.[2]
    from
        (select
            ExistingData.groupIdentifier,
            ExistingData.value,
            ROW_NUMBER() over (partition by ExistingData.groupIdentifier order by ExistingData.rowOrdinal desc) as ActualOrdinal
        from
            @potentialGroupsInExistingData PotentialGroup
            inner join @existingData ExistingData on
                ExistingData.groupIdentifier = PotentialGroup.groupIdentifier
        where
            PotentialGroup.groupSize = 2) as T
    pivot ( min(value) for ActualOrdinal in ([1], [2]) ) as p

-- Same shape for the new groups of exactly two rows.
declare @newGroupsOfSizeTwo table (groupIdentifier int, valueOne varchar(1), valueTwo varchar(1))

insert into @newGroupsOfSizeTwo (groupIdentifier, valueOne, valueTwo)
    select
        p.groupIdentifier,
        p.[1],
        p.[2]
    from
        (select
            NewData.groupIdentifier,
            NewData.value,
            ROW_NUMBER() over (partition by NewData.groupIdentifier order by NewData.rowOrdinal desc) as ActualOrdinal
        from
            @groupsInNewData NewDataGroup
            inner join @newData NewData on
                NewData.groupIdentifier = NewDataGroup.groupIdentifier
        where
            NewDataGroup.groupSize = 2) as T
    pivot ( min(value) for ActualOrdinal in ([1], [2]) ) as p

-- Two groups match when the values agree at every position.
select
    ExistingData.groupIdentifier as ExistingGroupIdentifier,
    NewData.groupIdentifier as NewGroupIdentifier
from
    @newGroupsOfSizeTwo NewData
    inner join @existingGroupsOfSizeTwo ExistingData on
        ExistingData.valueOne = NewData.valueOne
        and ExistingData.valueTwo = NewData.valueTwo


-- handle groups of size three
-----------------------------------
-- Rows are numbered from the highest rowOrdinal downwards on both sides, so valueOne is the
-- value of the last row of a group and valueThree the value of its first row.
declare @existingGroupsOfSizeThree table (
    groupIdentifier int,
    valueOne varchar(1),
    valueTwo varchar(1),
    valueThree varchar(1))

;with NumberedExistingRows as (
    select
        ExistingRow.groupIdentifier,
        ExistingRow.value,
        ROW_NUMBER() over (partition by ExistingRow.groupIdentifier order by ExistingRow.rowOrdinal desc) as ActualOrdinal
    from
        @existingData ExistingRow
        inner join @potentialGroupsInExistingData CandidateGroup on
            CandidateGroup.groupIdentifier = ExistingRow.groupIdentifier
    where
        CandidateGroup.groupSize = 3
)
insert into @existingGroupsOfSizeThree (groupIdentifier, valueOne, valueTwo, valueThree)
    select
        Pivoted.groupIdentifier,
        Pivoted.[1],
        Pivoted.[2],
        Pivoted.[3]
    from
        NumberedExistingRows
        pivot ( min(value) for ActualOrdinal in ([1], [2], [3]) ) as Pivoted

declare @newGroupsOfSizeThree table (
    groupIdentifier int,
    valueOne varchar(1),
    valueTwo varchar(1),
    valueThree varchar(1))

;with NumberedNewRows as (
    select
        NewRow.groupIdentifier,
        NewRow.value,
        ROW_NUMBER() over (partition by NewRow.groupIdentifier order by NewRow.rowOrdinal desc) as ActualOrdinal
    from
        @newData NewRow
        inner join @groupsInNewData NewGroup on
            NewGroup.groupIdentifier = NewRow.groupIdentifier
    where
        NewGroup.groupSize = 3
)
insert into @newGroupsOfSizeThree (groupIdentifier, valueOne, valueTwo, valueThree)
    select
        Pivoted.groupIdentifier,
        Pivoted.[1],
        Pivoted.[2],
        Pivoted.[3]
    from
        NumberedNewRows
        pivot ( min(value) for ActualOrdinal in ([1], [2], [3]) ) as Pivoted

-- A pair of groups matches when all three positions carry the same value.
select
    ExistingGroup.groupIdentifier as ExistingGroupIdentifier,
    NewGroup.groupIdentifier as NewGroupIdentifier
from
    @existingGroupsOfSizeThree ExistingGroup
    inner join @newGroupsOfSizeThree NewGroup on
        NewGroup.valueOne = ExistingGroup.valueOne
        and NewGroup.valueTwo = ExistingGroup.valueTwo
        and NewGroup.valueThree = ExistingGroup.valueThree

2 个答案:

答案 0 :(得分:2)

一般想法

在给定的表中,同一个组ID可以对应多行。如果我们能把给定的表汇总成每个组ID只占一行、并且该组的所有值都合并到同一列中,那么找出所有匹配的组就变得很简单了。

如果我们进行了这次改造

@existingData -> @ExistingDataGrouped (ID, DataValues)

@newData -> @NewDataGrouped (ID, DataValues)

那么最终的查询将如下所示(请注意,我们是按DataValues进行联接,而不是按ID):

-- Pair up groups whose collapsed value lists are identical (the join is on DataValues, not on ID).
SELECT
    ExistingGroup.ID,
    NewGroup.ID
FROM @ExistingDataGrouped AS ExistingGroup
INNER JOIN @NewDataGrouped AS NewGroup
    ON ExistingGroup.DataValues = NewGroup.DataValues

如何制作grouped表格

一些优化

如果源行数量很大,可以使用CHECKSUM_AGG进行一些初步过滤。

-- Preliminary filter: pairs up groups whose row count and aggregated checksum agree.
-- Checksums can collide, so the result may still contain pairs that do not really match;
-- it only rules out groups that certainly differ.
-- Identifiers are spelled exactly as in the table declarations so the query also runs under a
-- case-sensitive collation; the leading semicolon terminates whatever statement precedes WITH.
;WITH
CTE_ExistingRN
AS
(
    -- Renumber the rows of every existing group 1..n in rowOrdinal order, so gaps and
    -- different starting ordinals do not matter.
    SELECT
        groupIdentifier
        ,ROW_NUMBER() OVER(PARTITION BY groupIdentifier ORDER BY rowOrdinal) AS rn
        ,value
    FROM @existingData
)
,CTE_NewRN
AS
(
    -- Same renumbering for the new groups.
    SELECT
        groupIdentifier
        ,ROW_NUMBER() OVER(PARTITION BY groupIdentifier ORDER BY rowOrdinal) AS rn
        ,value
    FROM @newData
)
,CTE_ExistingAgg
AS
(
    -- One row per existing group: its size and one checksum over all (position, value) pairs.
    SELECT
        groupIdentifier
        ,COUNT(*) AS GroupSize
        ,CHECKSUM_AGG(CHECKSUM(rn, value)) AS DataValues
    FROM CTE_ExistingRN
    GROUP BY groupIdentifier
)
,CTE_NewAgg
AS
(
    -- One row per new group, built the same way.
    SELECT
        groupIdentifier
        ,COUNT(*) AS GroupSize
        ,CHECKSUM_AGG(CHECKSUM(rn, value)) AS DataValues
    FROM CTE_NewRN
    GROUP BY groupIdentifier
)
SELECT
    CTE_ExistingAgg.groupIdentifier AS ExistingGroupIdentifier
    ,CTE_NewAgg.groupIdentifier AS NewGroupIdentifier
FROM
    CTE_ExistingAgg
    INNER JOIN CTE_NewAgg
        ON CTE_NewAgg.DataValues = CTE_ExistingAgg.DataValues
        -- groups of different sizes can never match, so drop those pairs right away
        AND CTE_NewAgg.GroupSize = CTE_ExistingAgg.GroupSize
;

首先,我们对所有行重新编号,使每个组都从1开始(CTE_ExistingRN和CTE_NewRN)。

CHECKSUM(rn, Value)为每个源行返回一个整数,该整数同时取决于行号和该行的值。不同的输入通常会产生不同的校验和。

CHECKSUM_AGG将所有校验和组合在一起。

结果集:

ExistingGroupIdentifier    NewGroupIdentifier
100                        1
100                        2

此结果包含所有完全匹配的组(100, 1),也可能包含一些实际并不匹配、只是校验和碰巧相同的组(100, 2)。因此这一步只是初步筛选:要获得准确的结果,应当比较实际的值,而不是校验和。不过,这一步可以过滤掉大量肯定不匹配的组。

使用XML的解决方案

此解决方案将每个组的值转换为XML,并提供准确的结果。我之前从未使用过FOR XML,并且很想知道它是如何工作的。

-- Exact match: collapse every group into one string of its values, each followed by a comma and
-- taken in rowOrdinal order, then join the two sides on that string.
-- Identifiers are spelled exactly as in the table declarations so the query also runs under a
-- case-sensitive collation; the leading semicolon terminates whatever statement precedes WITH.
-- NOTE(review): rows whose value is NULL presumably contribute nothing to the string, and values
-- that are themselves a comma would make the string ambiguous -- confirm that neither occurs.
;WITH
CTE_ExistingGroups
AS
(
    -- The distinct existing group ids.
    SELECT DISTINCT groupIdentifier
    FROM @existingData
)
,CTE_NewGroups
AS
(
    -- The distinct new group ids.
    SELECT DISTINCT groupIdentifier
    FROM @newData
)
,CTE_ExistingAgg
AS
(
    SELECT
        CTE_ExistingGroups.groupIdentifier
        ,CA_Data.XML_Value AS DataValues
    FROM
        CTE_ExistingGroups
        CROSS APPLY
        (
            -- The unnamed column plus PATH('') concatenates the values without element tags.
            SELECT ExistingRow.value + ','
            FROM @existingData AS ExistingRow
            WHERE ExistingRow.groupIdentifier = CTE_ExistingGroups.groupIdentifier
            ORDER BY ExistingRow.rowOrdinal FOR XML PATH(''), TYPE
        ) AS CA_XML(XML_Value)
        CROSS APPLY
        (
            -- Convert the XML fragment back to plain text.
            SELECT CA_XML.XML_Value.value('.', 'NVARCHAR(MAX)')
        ) AS CA_Data(XML_Value)
)
,CTE_NewAgg
AS
(
    SELECT
        CTE_NewGroups.groupIdentifier
        ,CA_Data.XML_Value AS DataValues
    FROM
        CTE_NewGroups
        CROSS APPLY
        (
            SELECT NewRow.value + ','
            FROM @newData AS NewRow
            WHERE NewRow.groupIdentifier = CTE_NewGroups.groupIdentifier
            ORDER BY NewRow.rowOrdinal FOR XML PATH(''), TYPE
        ) AS CA_XML(XML_Value)
        CROSS APPLY
        (
            SELECT CA_XML.XML_Value.value('.', 'NVARCHAR(MAX)')
        ) AS CA_Data(XML_Value)
)
SELECT
    CTE_ExistingAgg.groupIdentifier AS ExistingGroupIdentifier
    ,CTE_NewAgg.groupIdentifier AS NewGroupIdentifier
FROM
    CTE_ExistingAgg
    INNER JOIN CTE_NewAgg ON CTE_NewAgg.DataValues = CTE_ExistingAgg.DataValues
;

结果集:

ExistingGroupIdentifier    NewGroupIdentifier
100                        1

答案 1 :(得分:0)

试试这个:

-- Existing groups: rows sharing a groupIdentifier form one group, ordered by rowOrdinal.
DECLARE @existingData TABLE (
    groupIdentifier INT,
    rowOrdinal INT,
    value VARCHAR(1)
);

INSERT INTO @existingData (groupIdentifier, rowOrdinal, value)
VALUES
    (100, 0, 'X'),
    (100, 1, 'Y'),

    (200, 0, 'A'),
    (200, 1, 'B'),
    (200, 2, 'C'),

    (40, 0, 'X'),

    (41, 0, 'Y');

-- Incoming groups to look up among the existing ones.
DECLARE @newData TABLE (
    groupIdentifier INT,
    rowOrdinal INT,
    value VARCHAR(1)
);

INSERT INTO @newData (groupIdentifier, rowOrdinal, value)
VALUES
    (1, 55, 'X'),
    (1, 59, 'Y'),

    (2, 0, 'Y'),
    (2, 1, 'X');

-- One row per (existing group, new group) pair that matches.
DECLARE @results TABLE (
    existingGID INT,
    newGID INT
);

-- Set-based comparison, one statement instead of nested cursors:
--   1. number the rows of every group 1..n in rowOrdinal order and attach the group's size,
--      so neither the starting ordinal (zero, negative, anything) nor gaps matter;
--   2. join rows of equally sized groups that sit at the same position and carry the same value;
--   3. a pair of groups matches when the number of joined positions equals the group size,
--      i.e. every position agreed.
-- A NULL value never equals anything, so groups containing NULL values are not reported.
WITH ExistingRows AS (
    SELECT
        groupIdentifier,
        value,
        ROW_NUMBER() OVER (PARTITION BY groupIdentifier ORDER BY rowOrdinal) AS position,
        COUNT(*) OVER (PARTITION BY groupIdentifier) AS groupSize
    FROM @existingData
),
NewRows AS (
    SELECT
        groupIdentifier,
        value,
        ROW_NUMBER() OVER (PARTITION BY groupIdentifier ORDER BY rowOrdinal) AS position,
        COUNT(*) OVER (PARTITION BY groupIdentifier) AS groupSize
    FROM @newData
)
INSERT INTO @results (existingGID, newGID)
SELECT
    ExistingRows.groupIdentifier,
    NewRows.groupIdentifier
FROM ExistingRows
INNER JOIN NewRows
    ON NewRows.groupSize = ExistingRows.groupSize
    AND NewRows.position = ExistingRows.position
    AND NewRows.value = ExistingRows.value
GROUP BY
    ExistingRows.groupIdentifier,
    NewRows.groupIdentifier,
    ExistingRows.groupSize
HAVING COUNT(*) = ExistingRows.groupSize;

-- Ordered so the output is deterministic.
SELECT
    existingGID,
    newGID
FROM @results
ORDER BY
    existingGID,
    newGID;

我得先走了,稍后我会编辑这个回答,补充更好的注释来解释代码的作用。