将一列中的标识符集合合并成组,使每组在另一列中的标识符唯一

时间:2012-07-16 16:13:30

标签: sql sql-server-2008 tsql uniqueidentifier

我在这个问题上花的时间多到不好意思承认。我有一个用游标实现的解决方案(见下文),但想知道是否还有其他方法。这个问题对我来说尤其棘手,因为SQL中没有数组之类的常用结构。

这似乎也是一个很适合用递归解决的问题,但我没能想出具体做法。平台是 MSSQL 2008(T-SQL)。

考虑一个表,它包含两列非唯一的标识符和一个日期列。对于每个日期,我想把第一列(X)中的标识符合并(分组)成若干集合,使得每个集合内第二列(Y)的标识符都是唯一的。

  • X标识符是非唯一的
  • Y标识符在每组X标识符中都是唯一的,但总体上是非唯一的
  • 合并时,使用最小可能的X标识符
  • 同一个X标识符不会跨越多个日期

也许最好的起点是一些样本数据。解决方案中还有一些扩展的样本数据。在我使用它的实际实现中,通常少于200行,最常见的是少于100行。

Dt         X     Y     newX
6/1/2012   1     1     1
6/1/2012   1     2     1
6/1/2012   2     3     1
6/1/2012   3     1     3   <-- because Y=1 is already in X=1
6/1/2012   3     4     3
6/1/2012   4     5     1
6/1/2012   5     4     1   <-- Y=4 is in X=3 but not X=1
6/1/2012   5     6     1
6/1/2012   6     4     6   <-- Y=4 is in X=1 and X=3
6/1/2012   6     7     6

解决方案......

-- Task: combine (condense / reassign / consolidate) sets of X identifiers into
-- groups whose Y identifiers are unique, separately for each date.
--  - X identifiers are non-unique
--  - Y identifiers are unique within each set of X identifiers, but non-unique overall
--  - When combining, the minimum possible X identifier is used
--  - An X identifier will not span more than a single date

-- Drop any copy of the work table left over from an earlier run in this session,
-- so the script can be re-executed without an "object already exists" error.
-- The OBJECT_ID check is used because the stated platform is SQL Server 2008.
IF OBJECT_ID('tempdb..#tmpA') IS NOT NULL
    DROP TABLE #tmpA

-- Work table:
--   Dt, X, Y - the input rows
--   newX     - group identifier assigned to the row; the sample rows are loaded
--              with 0, which the processing script treats as "not assigned yet"
--   origX    - copy of the X the row was loaded with; the processing script
--              overwrites X but never writes origX
CREATE TABLE #tmpA (Dt DATETIME, X INT, Y INT, newX INT, origX INT)

-- Sample data.
-- Every row is loaded with newX = 0 and with origX equal to X.
-- Dates are written in the unseparated 'YYYYMMDD' form: unlike 'YYYY-MM-DD', it is
-- converted to DATETIME the same way under every SET LANGUAGE / SET DATEFORMAT setting.
-- The "debug N" remarks presumably refer to the numbered "select N as debug" trace
-- statements of the cursor script that processes these rows.

-- 2012-06-01
INSERT INTO #tmpA (Dt, X, Y, newX, origX)
VALUES
    (CAST('20120601' AS DATETIME), 1, 2, 0, 1),
    (CAST('20120601' AS DATETIME), 1, 1, 0, 1),
    (CAST('20120601' AS DATETIME), 2, 5, 0, 2),
    (CAST('20120601' AS DATETIME), 3, 2, 0, 3),
    (CAST('20120601' AS DATETIME), 3, 3, 0, 3),
    (CAST('20120601' AS DATETIME), 4, 3, 0, 4),
    (CAST('20120601' AS DATETIME), 5, 5, 0, 5),
    (CAST('20120601' AS DATETIME), 6, 5, 0, 6),
    (CAST('20120601' AS DATETIME), 7, 2, 0, 7),
    (CAST('20120601' AS DATETIME), 7, 1, 0, 7)       -- causes a debug 4

-- 2012-06-02
INSERT INTO #tmpA (Dt, X, Y, newX, origX)
VALUES
    (CAST('20120602' AS DATETIME), 1, 2, 0, 1),
    (CAST('20120602' AS DATETIME), 1, 1, 0, 1),
    (CAST('20120602' AS DATETIME), 2, 5, 0, 2),
    (CAST('20120602' AS DATETIME), 3, 2, 0, 3),
    (CAST('20120602' AS DATETIME), 3, 3, 0, 3),
    (CAST('20120602' AS DATETIME), 4, 3, 0, 4),
    (CAST('20120602' AS DATETIME), 5, 5, 0, 5),
    (CAST('20120602' AS DATETIME), 6, 5, 0, 6),
    (CAST('20120602' AS DATETIME), 7, 7, 0, 7),
    (CAST('20120602' AS DATETIME), 7, 1, 0, 7),      -- causes a debug 3 if below not used
--  (CAST('20120602' AS DATETIME), 7, 6, 0, 7),      -- causes a debug 8 if above not used
    (CAST('20120602' AS DATETIME), 7, 5, 0, 7)

-- Cursor-driven pass over #tmpA: walks the rows in (Dt, X) order, assigns a newX
-- to each row, and rewrites X with newX once all rows of an X set have been seen.
-- Each branch emits a numbered "debug" result set so the path taken can be traced.

-- @X, @Y, @Dt, @newX receive the columns of the row currently fetched.
DECLARE @X          INT
DECLARE @tX         INT     -- X of the set being processed; compared with @X to detect a change in X
DECLARE @Y          INT
DECLARE @Dt         DATETIME
DECLARE @tDt        DATETIME = CAST('1900-01-01' AS DATETIME)   -- date being processed; the start value makes the first row look like a date change (assumes no row carries 1900-01-01)
DECLARE @newX       INT
DECLARE @min_X      INT     -- scratch minimum, reused: holds a MIN(X) or a MIN(newX) depending on the step
-- @min_newX is a second scratch minimum; it is shown as "minR" in the debug result sets
DECLARE @min_newX   INT

-- Rows are visited by date, then by X; the order of rows sharing Dt and X is not specified.
-- NOTE(review): no cursor options are given, so the server picks the cursor type.
-- The loop tests the fetched newX (@newX != 0) while its own UPDATEs change newX
-- and X in #tmpA - confirm the effective cursor type exposes those updates as intended.
DECLARE CursorA CURSOR FOR SELECT Dt, X, Y, newX FROM #tmpA
ORDER BY Dt, X
OPEN CursorA
FETCH NEXT FROM CursorA INTO @Dt, @X, @Y, @newX
SET @tX = @X        -- initialize for change in X detection

WHILE (@@FETCH_STATUS = 0)
BEGIN

-- a change in X?
-- (only X is compared, so a new date that starts with the same X value does not trigger this)
IF (@tX != @X)
BEGIN
    -- change in X, update all prior X to their newX (which should all be the same)
    -- the previous set is located by @tDt, the date recorded in the date-change branch below
    UPDATE #tmpA SET X = newX WHERE Dt = @tDt AND X = @tX
select 1 as debug, @tX as tX, @min_X as minX, @min_newX as minR, @X as X, @Y as Y, @newX as newX
    SET @tX = @X
END

IF (@newX != 0)
BEGIN
    -- newX for this X and Y already assigned, move on
    FETCH NEXT FROM CursorA INTO @Dt, @X, @Y, @newX
    CONTINUE
END

IF (@Dt != @tDt)
BEGIN
    -- date change
    SET @tDt = @Dt

    -- all for this first X are simply the same identifier
    -- (every row of this X on this date gets newX = X)
    UPDATE #tmpA SET newX = @X WHERE Dt = @Dt AND X = @X
select 2 as debug, @tX as tX, @min_X as minX, @min_newX as minR, @X as X, @Y as Y, @newX as newX
    FETCH NEXT FROM CursorA INTO @Dt, @X, @Y, @newX
    CONTINUE
END

-- still on same date
-- is there any duplicate Y already assigned a newX?
-- @min_X = smallest other X on this date with the same Y and a non-zero newX (NULL when there is none)
SELECT @min_X = MIN(X) FROM #tmpA
    WHERE Dt = @Dt AND X != @X AND Y = @Y AND newX != 0

IF @min_X IS NOT NULL
BEGIN

    -- there is a Y duplicate within this date
    -- find the earliest X which does not have a duplicate Y
    -- (among other X values on this date with assigned rows, excluding every X that holds this Y)
    SELECT @min_newX = MIN(X) FROM #tmpA
        WHERE Dt = @Dt AND X != @X AND Y != @Y AND newX != 0
        AND X NOT IN (SELECT X FROM #tmpA
            WHERE Dt = @Dt AND X != @X AND Y = @Y AND newX != 0)

    IF @min_newX IS NOT NULL
    BEGIN

        -- is there an "earlier" X already assigned a newX?
        -- @min_X is reused here: smallest non-zero newX already given to a row of the current X
        SELECT @min_X = MIN(newX) FROM #tmpA
            WHERE Dt = @Dt AND X = @X AND newX !=0

        IF @min_X IS NOT NULL
        BEGIN

            -- there is another X already assigned a newX
            IF @min_newX >= @min_X
            BEGIN
                -- set the other one to this one
                -- rows of this X currently carrying @min_X move to @min_newX, and so does this row
                UPDATE #tmpA SET newX = @min_newX
                    WHERE Dt = @Dt AND X = @X AND newX = @min_X
                UPDATE #tmpA SET newX = @min_newX
                    WHERE Dt = @Dt AND X = @X AND Y = @Y
select 3 as debug, @tX as tX, @min_X as minX, @min_newX as minR, @X as X, @Y as Y, @newX as newX
            END
            ELSE
            BEGIN
                -- the newX already used by this X is larger than the candidate: this row takes that existing newX
                UPDATE #tmpA SET newX = @min_X
                    WHERE Dt = @Dt AND X = @X AND Y = @Y
select 4 as debug, @tX as tX, @min_X as minX, @min_newX as minR, @X as X, @Y as Y, @newX as newX
            END

            FETCH NEXT FROM CursorA INTO @Dt, @X, @Y, @newX
            CONTINUE
        END

        -- no row of the current X has a newX yet: this row takes the candidate found above
        UPDATE #tmpA SET newX = @min_newX
            WHERE Dt = @Dt AND X = @X AND Y = @Y
select 5 as debug, @tX as tX, @min_X as minX, @min_newX as minR, @X as X, @Y as Y, @newX as newX
        FETCH NEXT FROM CursorA INTO @Dt, @X, @Y, @newX
        CONTINUE

    END

    -- no other X without a duplicate Y already assigned a newX so assign this entire X set to itself
    UPDATE #tmpA SET newX = @X WHERE Dt = @Dt AND X = @X
select 6 as debug, @tX as tX, @min_X as minX, @min_newX as minR, @X as X, @Y as Y, @newX as newX
    FETCH NEXT FROM CursorA INTO @Dt, @X, @Y, @newX
    CONTINUE
END

-- no other assigned X on this date holds this Y, but it's possible that another
-- row of this X already has a newX set to something different
-- @min_newX = smallest non-zero newX among the other-Y rows of the current X
SELECT @min_newX = MIN(newX) FROM #tmpA
    WHERE Dt = @Dt AND X = @X AND Y != @Y AND newX != 0

-- also find the smallest other X on this date that has an assigned row with a different Y
SELECT @min_X = MIN(X) FROM #tmpA
    WHERE Dt = @Dt AND X != @X AND Y != @Y AND newX != 0

IF @min_newX IS NULL
BEGIN
    -- no other Y for this X is assigned so set it to the minimum X already found
    -- (if that lookup matched no row, @min_X is NULL and newX is set to NULL)
    UPDATE #tmpA SET newX = @min_X
        WHERE Dt = @Dt AND X = @X AND Y = @Y
select 7 as debug, @tX as tX, @min_X as minX, @min_newX as minR, @X as X, @Y as Y, @newX as newX
    FETCH NEXT FROM CursorA INTO @Dt, @X, @Y, @newX
    CONTINUE
END

-- there is another of the same X with a newX
-- (the "@min_newX IS NULL" test cannot be true here: that case was handled just above)
IF (@min_X = @min_newX OR @min_X > @min_newX OR @min_newX IS NULL)
BEGIN
    -- there is a different Y for this X which has already been assigned the same newX as this one should be
    -- or a later one was found
    UPDATE #tmpA SET newX = @min_X
        WHERE Dt = @Dt AND X = @X AND Y = @Y
select 8 as debug, @tX as tX, @min_X as minX, @min_newX as minR, @X as X, @Y as Y, @newX as newX
    FETCH NEXT FROM CursorA INTO @Dt, @X, @Y, @newX
    CONTINUE
END

-- remaining case (@min_X is smaller than @min_newX, or @min_X is NULL):
-- this row takes the newX already used by its own X set
UPDATE #tmpA SET newX = @min_newX
    WHERE Dt = @Dt AND X = @X AND Y = @Y
select 9 as debug, @tX as tX, @min_X as minX, @min_newX as minR, @X as X, @Y as Y, @newX as newX
FETCH NEXT FROM CursorA INTO @Dt, @X, @Y, @newX

END

-- gotta catch the last set
-- (inside the loop a set is only rewritten when the next X starts, so the final set is rewritten here)
-- NOTE(review): this filters on @Dt where the in-loop rewrite uses @tDt; presumably @Dt still
-- holds the date of the last row fetched - confirm
UPDATE #tmpA SET X = newX WHERE Dt = @Dt AND X = @tX

-- show the final state of the work table
SELECT * FROM #tmpA
--  ORDER BY Dt, X, Y
CLOSE CursorA
DEALLOCATE CursorA
-- stop executing the batch here
RETURN

输出应该是这样的:

Dt                      X   Y   newX    origX
2012-06-01 00:00:00.000 1   2   1   1
2012-06-01 00:00:00.000 1   1   1   1
2012-06-01 00:00:00.000 1   5   1   2
2012-06-01 00:00:00.000 3   2   3   3
2012-06-01 00:00:00.000 3   3   3   3
2012-06-01 00:00:00.000 1   3   1   4
2012-06-01 00:00:00.000 3   5   3   5
2012-06-01 00:00:00.000 6   5   6   6
2012-06-01 00:00:00.000 6   2   6   7
2012-06-01 00:00:00.000 6   1   6   7
2012-06-02 00:00:00.000 1   2   1   1
2012-06-02 00:00:00.000 1   1   1   1
2012-06-02 00:00:00.000 1   5   1   2
2012-06-02 00:00:00.000 3   2   3   3
2012-06-02 00:00:00.000 3   3   3   3
2012-06-02 00:00:00.000 1   3   1   4
2012-06-02 00:00:00.000 3   5   3   5
2012-06-02 00:00:00.000 6   5   6   6
2012-06-02 00:00:00.000 7   7   7   7
2012-06-02 00:00:00.000 7   1   7   7
2012-06-02 00:00:00.000 7   5   7   7

1 个答案:

答案 0 :(得分:0)

你想要的大概是这样的。把下面的查询粘贴到向临时表插入数据的语句之后:

-- Number the rows inside each X group of #tmpA and expose that number as newX.
-- This query only enumerates rows per X; it does not compare Y values across groups.
-- The window is ordered by (Dt, Y) rather than by the partition column X itself:
-- ordering by X leaves every row of a partition tied, so the numbering could differ
-- between runs. (Assumes (Dt, X, Y) is unique, as the problem statement describes.)
SELECT
    results.X,
    results.Y,
    results.origX,
    results.newX
FROM
(
    SELECT
        X,
        Y,
        origX,
        ROW_NUMBER() OVER (PARTITION BY X ORDER BY Dt, Y) AS newX
    FROM #tmpA
) AS results