我有一个存储相关行组的表,不同的行通过groupIdentifier列相关。组的大小可以是任意数量的行。
我需要能够传入一批新的行组,然后找出每个新组与现有匹配组之间的映射。复杂之处在于,组内各行的顺序由rowOrdinal值定义,匹配时必须将其考虑在内。rowOrdinal值并不总是从0开始,但组内的行按该值排序。此外,@existingData包含数以千计的潜在组,因此查询必须高效。
以下是输入数据集示例:
-- Groups that are already stored. Rows sharing a groupIdentifier form one
-- group; rowOrdinal gives the order of the rows inside that group.
DECLARE @existingData TABLE (
    groupIdentifier INT,
    rowOrdinal      INT,
    value           VARCHAR(1)
);

INSERT INTO @existingData (groupIdentifier, rowOrdinal, value)
VALUES
    (100, 0, 'X'),
    (100, 1, 'Y'),
    (200, 0, 'A'),
    (200, 1, 'B'),
    (200, 2, 'C'),
    (40,  0, 'X'),
    (41,  0, 'Y');

-- Incoming groups to look up. Their rowOrdinal values need not start at 0.
DECLARE @newData TABLE (
    groupIdentifier INT,
    rowOrdinal      INT,
    value           VARCHAR(1)
);

INSERT INTO @newData (groupIdentifier, rowOrdinal, value)
VALUES
    (1, 55, 'X'),
    (1, 59, 'Y'),
    (2, 0,  'Y'),
    (2, 1,  'X');
-- Expected: new group 1 (X, Y) maps to existing group 100;
-- new group 2 (Y, X) has no counterpart in @existingData.
所需的结果是包含两列的结果集,existingGroupIdentifier和newGroupIdentifier。在这种情况下,唯一的结果行是100,1。100是@existingData groupIdentifier,1是@newData groupIdentifier
编辑:以下是我目前想出的办法。假设最大组大小为N,我可以针对每一种组大小,手动复制粘贴使用pivot和临时表的T-SQL代码来进行比较。但是,这会把系统限制在最大为N的组上,而且代码很难看;如果可能的话,我更希望用单个查询完成。
-- Stored groups; rowOrdinal orders the rows within each groupIdentifier.
DECLARE @existingData TABLE (
    groupIdentifier INT,
    rowOrdinal      INT,
    value           VARCHAR(1)
);

INSERT INTO @existingData (groupIdentifier, rowOrdinal, value)
VALUES
    (100, 0, 'X'),
    (100, 1, 'Y'),
    (200, 0, 'A'),
    (200, 1, 'B'),
    (200, 2, 'C'),
    (40,  0, 'X'),
    (41,  0, 'Y');

-- Candidate groups of size one (3), two (1 and 2) and three (5), with
-- rowOrdinal values that are neither zero-based nor contiguous.
DECLARE @newData TABLE (
    groupIdentifier INT,
    rowOrdinal      INT,
    value           VARCHAR(1)
);

INSERT INTO @newData (groupIdentifier, rowOrdinal, value)
VALUES
    (1, 55,  'X'),
    (1, 59,  'Y'),
    (2, 0,   'Y'),
    (2, 1,   'X'),
    (3, 99,  'Y'),
    (5, 4,   'A'),
    (5, 10,  'B'),
    (5, 200, 'C');
-- Record the size of every existing group that could possibly match, i.e. that
-- shares at least one value with @newData.
--
-- The EXISTS test is applied per GROUP (via the correlated GroupMember lookup),
-- not per row. Filtering individual rows before counting would store the number
-- of rows whose value occurs in @newData instead of the real group size, so an
-- existing group such as (Q, X, Y) would be recorded as size 2 and could then
-- be reported as a match for the new group (X, Y).
DECLARE @potentialGroupsInExistingData TABLE (groupIdentifier INT, groupSize INT);

INSERT INTO @potentialGroupsInExistingData (groupIdentifier, groupSize)
SELECT
    ExistingData.groupIdentifier,
    COUNT(*)
FROM @existingData AS ExistingData
WHERE EXISTS (
    SELECT *
    FROM @existingData AS GroupMember
    INNER JOIN @newData AS NewData
        ON NewData.value = GroupMember.value
    WHERE GroupMember.groupIdentifier = ExistingData.groupIdentifier
)
GROUP BY ExistingData.groupIdentifier;

-- Size of every incoming group; all of them have to be looked up.
DECLARE @groupsInNewData TABLE (groupIdentifier INT, groupSize INT);

INSERT INTO @groupsInNewData (groupIdentifier, groupSize)
SELECT
    NewData.groupIdentifier,
    COUNT(*)
FROM @newData AS NewData
GROUP BY NewData.groupIdentifier;
-- Groups of size one: no pivoting is needed because each group has a single
-- row, so two such groups match exactly when their only values are equal.
-----------------------------------
SELECT
    ExistingRow.groupIdentifier AS ExistingGroupIdentifier,
    NewRow.groupIdentifier      AS NewGroupIdentifier
FROM @existingData AS ExistingRow
INNER JOIN @potentialGroupsInExistingData AS ExistingGroup
    ON  ExistingGroup.groupIdentifier = ExistingRow.groupIdentifier
    AND ExistingGroup.groupSize = 1
INNER JOIN @newData AS NewRow
    ON NewRow.value = ExistingRow.value
INNER JOIN @groupsInNewData AS NewGroup
    ON  NewGroup.groupIdentifier = NewRow.groupIdentifier
    AND NewGroup.groupSize = 1
-- Groups of size two: flatten each group into one row (valueOne, valueTwo) and
-- join the flattened rows.
-- ROW_NUMBER replaces the arbitrary rowOrdinal values with positions 1..n.
-- Both sides are numbered in descending rowOrdinal order, so valueOne holds the
-- LAST row of a group; the comparison is still position-by-position.
-- Value columns are VARCHAR(1) to match the source tables (valueTwo was
-- previously declared VARCHAR(2), inconsistent with every other value column).
-----------------------------------
DECLARE @existingGroupsOfSizeTwo TABLE (groupIdentifier INT, valueOne VARCHAR(1), valueTwo VARCHAR(1));

INSERT INTO @existingGroupsOfSizeTwo (groupIdentifier, valueOne, valueTwo)
SELECT
    p.groupIdentifier,
    p.[1],
    p.[2]
FROM (
    SELECT
        ExistingData.groupIdentifier,
        ExistingData.value,
        ROW_NUMBER() OVER (
            PARTITION BY ExistingData.groupIdentifier
            ORDER BY ExistingData.rowOrdinal DESC
        ) AS ActualOrdinal
    FROM @potentialGroupsInExistingData AS PotentialGroup
    INNER JOIN @existingData AS ExistingData
        ON ExistingData.groupIdentifier = PotentialGroup.groupIdentifier
    WHERE PotentialGroup.groupSize = 2
) AS T
PIVOT (MIN(value) FOR ActualOrdinal IN ([1], [2])) AS p;

DECLARE @newGroupsOfSizeTwo TABLE (groupIdentifier INT, valueOne VARCHAR(1), valueTwo VARCHAR(1));

INSERT INTO @newGroupsOfSizeTwo (groupIdentifier, valueOne, valueTwo)
SELECT
    p.groupIdentifier,
    p.[1],
    p.[2]
FROM (
    SELECT
        NewData.groupIdentifier,
        NewData.value,
        ROW_NUMBER() OVER (
            PARTITION BY NewData.groupIdentifier
            ORDER BY NewData.rowOrdinal DESC
        ) AS ActualOrdinal
    FROM @groupsInNewData AS NewDataGroup
    INNER JOIN @newData AS NewData
        ON NewData.groupIdentifier = NewDataGroup.groupIdentifier
    WHERE NewDataGroup.groupSize = 2
) AS T
PIVOT (MIN(value) FOR ActualOrdinal IN ([1], [2])) AS p;

-- A pair of groups matches when both positions carry the same value.
SELECT
    ExistingData.groupIdentifier AS ExistingGroupIdentifier,
    NewData.groupIdentifier      AS NewGroupIdentifier
FROM @newGroupsOfSizeTwo AS NewData
INNER JOIN @existingGroupsOfSizeTwo AS ExistingData
    ON  ExistingData.valueOne = NewData.valueOne
    AND ExistingData.valueTwo = NewData.valueTwo
-- Groups of size three: same approach as for size two, with one more pivoted
-- position. Rows are numbered by descending rowOrdinal on both sides.
-----------------------------------
DECLARE @existingGroupsOfSizeThree TABLE (
    groupIdentifier INT,
    valueOne        VARCHAR(1),
    valueTwo        VARCHAR(1),
    valueThree      VARCHAR(1)
);

INSERT INTO @existingGroupsOfSizeThree
SELECT
    flattened.groupIdentifier,
    flattened.[1],
    flattened.[2],
    flattened.[3]
FROM (
    SELECT
        Member.groupIdentifier,
        Member.value,
        ROW_NUMBER() OVER (
            PARTITION BY Member.groupIdentifier
            ORDER BY Member.rowOrdinal DESC
        ) AS ActualOrdinal
    FROM @existingData AS Member
    INNER JOIN @potentialGroupsInExistingData AS Candidate
        ON Candidate.groupIdentifier = Member.groupIdentifier
    WHERE Candidate.groupSize = 3
) AS numbered
PIVOT (MIN(value) FOR ActualOrdinal IN ([1], [2], [3])) AS flattened;

DECLARE @newGroupsOfSizeThree TABLE (
    groupIdentifier INT,
    valueOne        VARCHAR(1),
    valueTwo        VARCHAR(1),
    valueThree      VARCHAR(1)
);

INSERT INTO @newGroupsOfSizeThree
SELECT
    flattened.groupIdentifier,
    flattened.[1],
    flattened.[2],
    flattened.[3]
FROM (
    SELECT
        Member.groupIdentifier,
        Member.value,
        ROW_NUMBER() OVER (
            PARTITION BY Member.groupIdentifier
            ORDER BY Member.rowOrdinal DESC
        ) AS ActualOrdinal
    FROM @newData AS Member
    INNER JOIN @groupsInNewData AS Incoming
        ON Incoming.groupIdentifier = Member.groupIdentifier
    WHERE Incoming.groupSize = 3
) AS numbered
PIVOT (MIN(value) FOR ActualOrdinal IN ([1], [2], [3])) AS flattened;

-- Report every existing/new pair whose three positions all agree.
SELECT
    ExistingGroup.groupIdentifier AS ExistingGroupIdentifier,
    NewGroup.groupIdentifier      AS NewGroupIdentifier
FROM @existingGroupsOfSizeThree AS ExistingGroup
INNER JOIN @newGroupsOfSizeThree AS NewGroup
    ON  NewGroup.valueOne   = ExistingGroup.valueOne
    AND NewGroup.valueTwo   = ExistingGroup.valueTwo
    AND NewGroup.valueThree = ExistingGroup.valueThree
答案 0 :(得分:2)
一般想法
给定的表可以有多个行用于相同的组ID。 如果我们有一种方法来收集给定的表,使得每个组ID都有一行加上一列中所有组的值,那么找到所有匹配的组就变得微不足道了。
如果我们进行了这次改造
@existingData
->
@ExistingDataGrouped (ID, DataValues)
@newData
->
@NewDataGrouped (ID, DataValues)
然后最终查询将如下所示(请注意,我们是按DataValues而不是按ID进行连接):
-- Pair up groups whose aggregated DataValues are identical; the group IDs
-- themselves take no part in the join condition.
SELECT
    ExistingGrouped.ID,
    NewGrouped.ID
FROM @NewDataGrouped AS NewGrouped
INNER JOIN @ExistingDataGrouped AS ExistingGrouped
    ON ExistingGrouped.DataValues = NewGrouped.DataValues
如何生成grouped表
有几种方法:
使用FOR XML(针对SQL Server搜索“group_concat”,例如“How to make a query with group_concat in sql server”);
使用支持指定顺序的GroupConcat函数的CLR实现。我个人使用 http://groupconcat.codeplex.com/ ,它可以作为一个不错的起点。
一些优化
如果源行数量很大,可以先使用CHECKSUM_AGG进行一些初步过滤。
-- Preliminary filter: pair groups whose aggregated checksums are equal.
-- Equal checksums do not prove equal content, so the output is a candidate
-- list that may contain false positives.
WITH
-- Positions 1..n inside every existing group, following RowOrdinal.
ExistingNumbered AS (
    SELECT
        E.GroupIdentifier,
        E.Value,
        ROW_NUMBER() OVER (PARTITION BY E.GroupIdentifier ORDER BY E.RowOrdinal) AS rn
    FROM @ExistingData AS E
),
-- Positions 1..n inside every new group, following RowOrdinal.
NewNumbered AS (
    SELECT
        N.GroupIdentifier,
        N.Value,
        ROW_NUMBER() OVER (PARTITION BY N.GroupIdentifier ORDER BY N.RowOrdinal) AS rn
    FROM @NewData AS N
),
-- One checksum per existing group, built from each row's (position, value).
ExistingHashed AS (
    SELECT
        ExistingNumbered.GroupIdentifier,
        CHECKSUM_AGG(CHECKSUM(ExistingNumbered.rn, ExistingNumbered.Value)) AS DataValues
    FROM ExistingNumbered
    GROUP BY ExistingNumbered.GroupIdentifier
),
-- One checksum per new group, built the same way.
NewHashed AS (
    SELECT
        NewNumbered.GroupIdentifier,
        CHECKSUM_AGG(CHECKSUM(NewNumbered.rn, NewNumbered.Value)) AS DataValues
    FROM NewNumbered
    GROUP BY NewNumbered.GroupIdentifier
)
SELECT
    ExistingHashed.GroupIdentifier AS ExistingGroupIdentifier,
    NewHashed.GroupIdentifier      AS NewGroupIdentifier
FROM NewHashed
INNER JOIN ExistingHashed
    ON ExistingHashed.DataValues = NewHashed.DataValues
;
首先,我们对所有行重新编号,使每个组都从1开始(CTE_ExistingRN和CTE_NewRN)。
CHECKSUM(rn, Value)为每个源行返回一个整数,该整数同时取决于行号和值。不同的值通常会产生不同的校验和。
CHECKSUM_AGG把一个组内的所有校验和合并为一个值。
结果集:
ExistingGroupIdentifier NewGroupIdentifier
100 1
100 2
此结果会包含所有完全匹配的组(100, 1),但也可能包含一些实际并不匹配、只是校验和恰好相同的组(100, 2)。这就是为什么这一步只是初步过滤。要获得准确的结果,应当比较实际的值,而不是校验和。不过,这一步可以过滤掉大量肯定不匹配的组。
使用XML的解决方案
此解决方案将每个组的值转换为XML,并给出准确的结果。我以前从未使用过FOR XML,很想了解它是如何工作的。
-- Exact matching: serialise every group into one string that lists its values
-- in RowOrdinal order, then join the two sides on that string.
--
-- Each row becomes its own <r><v>value</v></r> element instead of being
-- concatenated as Value + ','. With a plain delimiter the result is ambiguous
-- (a single value ',' and two empty values both give ',,'), and a NULL value
-- makes Value + ',' NULL so the row disappears and (X, NULL, Y) compares equal
-- to (X, Y). With one element per row the row count is always preserved: a
-- NULL value yields an empty <r/> element and special characters are escaped
-- by FOR XML. Consequently a NULL only matches a NULL at the same position.
WITH
CTE_ExistingGroups
AS
(
    SELECT DISTINCT GroupIdentifier
    FROM @ExistingData
)
,CTE_NewGroups
AS
(
    SELECT DISTINCT GroupIdentifier
    FROM @NewData
)
,CTE_ExistingAgg
AS
(
    SELECT
        CTE_ExistingGroups.GroupIdentifier
        -- FOR XML without TYPE in a subquery returns the XML text as NVARCHAR(MAX).
        ,(
            SELECT E.Value AS v
            FROM @ExistingData AS E
            WHERE E.GroupIdentifier = CTE_ExistingGroups.GroupIdentifier
            ORDER BY E.RowOrdinal
            FOR XML PATH('r')
        ) AS DataValues
    FROM
        CTE_ExistingGroups
)
,CTE_NewAgg
AS
(
    SELECT
        CTE_NewGroups.GroupIdentifier
        ,(
            SELECT N.Value AS v
            FROM @NewData AS N
            WHERE N.GroupIdentifier = CTE_NewGroups.GroupIdentifier
            ORDER BY N.RowOrdinal
            FOR XML PATH('r')
        ) AS DataValues
    FROM
        CTE_NewGroups
)
SELECT
    CTE_ExistingAgg.GroupIdentifier AS ExistingGroupIdentifier
    , CTE_NewAgg.GroupIdentifier AS NewGroupIdentifier
FROM
    CTE_ExistingAgg
    INNER JOIN CTE_NewAgg ON CTE_NewAgg.DataValues = CTE_ExistingAgg.DataValues
;
结果集:
ExistingGroupIdentifier NewGroupIdentifier
100 1
答案 1 :(得分:0)
试试这个:
-- Self-contained example: find every (existing group, new group) pair whose
-- values are identical position by position, where positions follow rowOrdinal.
-- The final result set has the columns existingGID and newGID.

-- Groups that are already stored.
DECLARE @existingData TABLE (
    groupIdentifier INT,
    rowOrdinal      INT,
    value           VARCHAR(1)
);

INSERT INTO @existingData (groupIdentifier, rowOrdinal, value)
VALUES
    (100, 0, 'X'),
    (100, 1, 'Y'),
    (200, 0, 'A'),
    (200, 1, 'B'),
    (200, 2, 'C'),
    (40,  0, 'X'),
    (41,  0, 'Y');

-- Incoming groups to look up.
DECLARE @newData TABLE (
    groupIdentifier INT,
    rowOrdinal      INT,
    value           VARCHAR(1)
);

INSERT INTO @newData (groupIdentifier, rowOrdinal, value)
VALUES
    (1, 55, 'X'),
    (1, 59, 'Y'),
    (2, 0,  'Y'),
    (2, 1,  'X');

DECLARE @results TABLE (
    existingGID INT,
    newGID      INT
);

-- One set-based statement replaces the three nested cursors.
-- Besides avoiding row-by-row processing over every pair of groups, it closes
-- gaps of the cursor version: negative rowOrdinal values (the -1 start
-- sentinel skipped them), duplicate rowOrdinal values inside a group, and NULL
-- values, which "<>" silently treated as equal. Here a NULL never matches.
WITH
-- Replace the arbitrary rowOrdinal values by positions 1..n and attach the
-- group's row count to each of its rows.
ExistingRanked AS (
    SELECT
        groupIdentifier,
        value,
        ROW_NUMBER() OVER (PARTITION BY groupIdentifier ORDER BY rowOrdinal) AS position,
        COUNT(*)     OVER (PARTITION BY groupIdentifier)                     AS groupSize
    FROM @existingData
),
NewRanked AS (
    SELECT
        groupIdentifier,
        value,
        ROW_NUMBER() OVER (PARTITION BY groupIdentifier ORDER BY rowOrdinal) AS position,
        COUNT(*)     OVER (PARTITION BY groupIdentifier)                     AS groupSize
    FROM @newData
)
INSERT INTO @results (existingGID, newGID)
SELECT
    e.groupIdentifier,
    n.groupIdentifier
FROM ExistingRanked AS e
INNER JOIN NewRanked AS n
    ON  n.groupSize = e.groupSize   -- groups of different sizes can never match
    AND n.position  = e.position
    AND n.value     = e.value
GROUP BY
    e.groupIdentifier,
    n.groupIdentifier,
    e.groupSize
-- Keep the pair only if EVERY position matched, not just some of them.
HAVING COUNT(*) = e.groupSize;

SELECT
    existingGID,
    newGID
FROM @results;
我得先走了,稍后我会补充更详细的注释来解释这段代码的作用。