How to efficiently split grouped records into batches

Asked: 2017-07-19 15:23:20

Tags: sql tsql

For each group in the table, I need to split that group into batches of a specific number of records and mark every record with the ID of the batch it belongs to.

Right now my cursor-based implementation is, IMHO, clumsy. Splitting 10,000 rows takes 1 minute, which, needless to say, is very slow. Any ideas on how to do this faster?

Here is the test script.

 -- Needed to generate big data
 DECLARE @Naturals TABLE (ID INT)
 INSERT INTO @Naturals (ID)
 VALUES (1),(2),(3),(4),(5),(6),(7),(8),(9),(10)

 DECLARE @TestData TABLE
 (
    LINK INT,
    F_House INT,
    F_Batch UNIQUEIDENTIFIER
 )

 INSERT INTO @TestData (LINK, F_House)
 SELECT ROW_NUMBER() OVER (order by T1.ID), ROW_NUMBER() OVER (order by T1.ID) % 5 
 FROM 
 @Naturals T1
 CROSS JOIN @Naturals T2
 CROSS JOIN @Naturals T3
 CROSS JOIN @Naturals T4
 --CROSS JOIN @Naturals T5 -- that would give us 100 000

 -- Finished preparing Data (10 000 rows)
 SELECT 'Processing:', COUNT(*) FROM @TestData

 DECLARE @batchSize INT -- That would be amount of rows in each batch
 SET @batchSize = 50

 IF OBJECT_ID('tempdb..#G') IS NOT NULL -- Split set of data into groups. We need to create batches in each group.
    DROP TABLE #G

 SELECT 
    buf.F_House, COUNT(*) AS GroupCount
 INTO #G
 FROM @TestData buf
 GROUP BY buf.F_House -- That logic could be tricky one. Right now simplifying

 DECLARE @F_House INT -- That would be group key

 DECLARE db_cursor CURSOR FOR
 SELECT F_House
 FROM #G
 ORDER BY F_House

 OPEN db_cursor FETCH NEXT FROM db_cursor INTO @F_House

 WHILE @@FETCH_STATUS = 0   
 BEGIN 
    PRINT 'Processing house group: ' + CAST(@F_House AS VARCHAR(10))

    -- For each group let's create batches
    WHILE EXISTS (SELECT 1 FROM @TestData AS itmds 
                  WHERE itmds.F_House = @F_House 
                  AND itmds.F_Batch IS NULL
                )
    BEGIN
        DECLARE @batchLink UNIQUEIDENTIFIER
        SET @batchLink = NEWID()

        UPDATE itmds
        SET itmds.F_Batch = @batchLink
        FROM @TestData AS itmds
        WHERE itmds.F_House = @F_House 
                  AND itmds.F_Batch IS NULL
                  AND itmds.LINK IN 
                  (
                        SELECT TOP (@batchSize)
                        sub.LINK
                        FROM @TestData sub
                        WHERE sub.F_House = @F_House
                        AND sub.F_Batch IS NULL
                  )

    END

    FETCH NEXT FROM db_cursor INTO @F_House
 END 

 CLOSE db_cursor   
 DEALLOCATE db_cursor

 SELECT
    buf.F_House, COUNT(distinct F_Batch) AS BatchCountInHouse
 FROM @TestData buf
 GROUP BY buf.F_House
 ORDER BY buf.F_House

Expected output (with batchSize = 50):

10,000 rows / 5 houses = 2,000 rows per house

2,000 rows per house / 50 (batchSize) = 40 batches per house

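Based on the numbers above (the % 5 in the test data yields houses 0 through 4), the final verification query should return something like:

 F_House  BatchCountInHouse
 0        40
 1        40
 2        40
 3        40
 4        40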

3 Answers:

Answer 0 (score: 2)

I would use an inner loop inside an outer loop that references the grouping level, and then iterate within each group to assign the BatchGroup. However, as you pointed out, speed is an issue with table variables and CTEs, so in this case I tested with a #table in tempdb. That way I can index it after the insert and tune performance. I can run the aggregation logic over a million rows in about 16 seconds, which I consider acceptable performance. But my dev box is an i7 6700 with 16 GB of DDR4 and an SSD, so timings may vary with hardware.

--Make up some fake data for example
DECLARE 
  @Start INT = 1
, @End INT = 100000
;

SET NOCOUNT ON;
IF OBJECT_ID('tempdb..#Temp') IS NOT NULL 
    DROP TABLE #Temp

CREATE Table #Temp (Id INT, Grp int, Val VARCHAR(8), BatchGroup int)

WHILE @Start <= @End
BEGIN
    INSERT INTO #Temp (Id, Grp, Val) 
    VALUES (@Start, CAST(RAND() * 8 AS INT) + 1, LEFT(NEWID(), 8))

    SELECT @Start += 1;  
END

CREATE CLUSTERED INDEX IX_Temp_Grp ON #Temp(Grp, BatchGroup)

--Determine Batch Size You want for groupings
DECLARE @BatchSize INT = 1000;

--Let's randomly mess with groupings
DECLARE @X INT = 1
WHILE @X <= 4
BEGIN
    ; WITH x AS 
      (
      SELECT TOP (@BatchSize * 4) 
        Id
      , Grp
      , Val
      FROM #Temp
      WHERE Grp = CAST(RAND() * 8 AS INT) + 1
      )
    UPDATE x
    SET Grp = CAST(RAND() * 8 AS INT) + 1

    SELECT @X += 1
END

DECLARE 
  @CurrentGroup INT = 1
, @CurrentBatch INT = 1

WHILE @CurrentGroup <= (SELECT MAX(Grp) FROM #Temp) -- alternatively: WHILE EXISTS (SELECT 1 FROM #Temp WHERE BatchGroup IS NULL)
BEGIN
    WHILE EXISTS (SELECT 1 FROM #Temp WHERE Grp = @CurrentGroup AND BatchGroup IS NULL)
    BEGIN
        ; WITH x AS 
        (
        SELECT TOP (@BatchSize) *
        FROM #Temp
        WHERE Grp = @CurrentGroup 
          AND BatchGroup IS NULL
        )
        update x
        SET BatchGroup = @CurrentBatch

        SELECT @CurrentBatch += 1;
    END

    SELECT @CurrentBatch = 1
    SELECT @CurrentGroup += 1;
END

--Proof
Select 
  Grp
, COUNT(DISTINCT Id)
, COUNT(DISTINCT BatchGroup)
From #Temp
GROUP BY Grp

Answer 1 (score: 1)

This is a set-based approach that avoids the cursor. The assigned F_Batch is a BIGINT:

;with baseRowNum as 
 (
   SELECT LINK, F_House,
      -- row number per F_House
      Row_Number() Over (PARTITION  BY F_House ORDER BY LINK) AS rn
   FROM @TestData
 )
SELECT *,
   -- combine F_House & group number into a unique result
   F_House * 10000 +
   -- start a new sub group for every F_House or after @batchSize rows
   Sum(CASE WHEN rn % @batchSize = 1 THEN 1 ELSE 0 END)
   Over (ORDER BY F_House, rn 
         ROWS Unbounded Preceding) AS F_Batch
FROM baseRowNum
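
To make the numbering easier to follow, here is a minimal, self-contained sketch of the same expression on toy data (the toy table, the batch size of 3, and the variable names are assumptions for illustration, not part of the answer):

DECLARE @toyBatchSize INT = 3
DECLARE @Toy TABLE (LINK INT, F_House INT)
INSERT INTO @Toy (LINK, F_House)
VALUES (1,1),(2,1),(3,1),(4,1),(5,1),
       (6,2),(7,2),(8,2),(9,2),(10,2)

;with toyRowNum as 
 (
   SELECT LINK, F_House,
      -- row number per F_House
      Row_Number() Over (PARTITION BY F_House ORDER BY LINK) AS rn
   FROM @Toy
 )
SELECT LINK, F_House, rn,
   F_House * 10000 +
   Sum(CASE WHEN rn % @toyBatchSize = 1 THEN 1 ELSE 0 END)
   Over (ORDER BY F_House, rn 
         ROWS Unbounded Preceding) AS F_Batch
FROM toyRowNum
-- House 1 rows get F_Batch 10001,10001,10001,10002,10002; house 2 continues
-- the running count with 20003,20003,20003,20004,20004. The counter never
-- resets, but the F_House * 10000 offset keeps the values unique as long as
-- no house produces 10,000 or more batches.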

If you really need a UNIQUEIDENTIFIER, you can join:

;with baseRowNums as 
 (
   SELECT LINK, F_House,
      -- row number per F_House
      Row_Number() Over (PARTITION  BY F_House ORDER BY LINK) AS rn
   FROM @TestData
 )
,batchNums as
 (
   SELECT *,
      -- combine F_House & group number into a unique result
      F_House * 10000 +
      -- start a new sub group for every F_House or after @batchSize rows
      Sum(CASE WHEN rn % @batchSize = 1 THEN 1 ELSE 0 END)
      Over (ORDER BY F_House, rn 
            ROWS Unbounded Preceding) AS F_Batch
   FROM baseRowNums
 )
,GUIDs as
 (
   select F_Batch, MAX(newid()) as GUID
   from batchNums
   group by F_Batch
 )
-- select * from
--from batchNums join GUIDs 
--  on batchNums.F_Batch = GUIDs.F_Batch
select F_House, GUID, count(*)
from batchNums join GUIDs 
  on batchNums.F_Batch = GUIDs.F_Batch
group by F_House, GUID
order by F_House, count(*) desc

See Fiddle
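
If the goal is to actually stamp the GUIDs onto @TestData (as in the question), the CTEs above can feed an UPDATE directly. A minimal sketch, reusing the question's @TestData and @batchSize and copying the CTE bodies from this answer (only the final UPDATE is new):

;with baseRowNums as 
 (
   SELECT LINK, F_House,
      -- row number per F_House
      Row_Number() Over (PARTITION BY F_House ORDER BY LINK) AS rn
   FROM @TestData
 )
,batchNums as
 (
   SELECT *,
      -- combine F_House & group number into a unique result
      F_House * 10000 +
      Sum(CASE WHEN rn % @batchSize = 1 THEN 1 ELSE 0 END)
      Over (ORDER BY F_House, rn 
            ROWS Unbounded Preceding) AS F_Batch
   FROM baseRowNums
 )
,GUIDs as
 (
   select F_Batch, MAX(newid()) as GUID
   from batchNums
   group by F_Batch
 )
-- write one GUID per computed batch back to the base table
update td
set td.F_Batch = g.GUID
from @TestData td
join batchNums bn on bn.LINK = td.LINK
join GUIDs g on g.F_Batch = bn.F_Batch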

Answer 2 (score: 0)

Actually, I tried NTILE() together with the cursor and it is very fast (I mean compared to the 1 minute per 10,000 rows in the question). 10,000 rows take 0 seconds.

100,000 rows take 3 seconds.

1,000,000 rows take 34 seconds.

10,000,000 rows take 6 minutes.

The complexity grows roughly linearly, which is good.

 -- Needed to generate big data
 DECLARE @Naturals TABLE (ID INT)
 INSERT INTO @Naturals (ID)
 VALUES (1),(2),(3),(4),(5),(6),(7),(8),(9),(10)

 DECLARE @TestData TABLE
 (
    LINK INT,
    F_House INT,
    F_Batch UNIQUEIDENTIFIER
 )

 INSERT INTO @TestData (LINK, F_House)
 SELECT ROW_NUMBER() OVER (order by T1.ID), ROW_NUMBER() OVER (order by T1.ID) % 5 
 FROM 
 @Naturals T1
 CROSS JOIN @Naturals T2
 CROSS JOIN @Naturals T3
 CROSS JOIN @Naturals T4
 --CROSS JOIN @Naturals T5 -- that would give us 100 000

 -- Finished preparing Data (10 000 rows)
 SELECT 'Processing:', COUNT(*) FROM @TestData

 DECLARE @batchSize INT -- That would be amount of rows in each batch
 SET @batchSize = 50

 IF OBJECT_ID('tempdb..#G') IS NOT NULL -- Split set of data into groups. We need to create batches in each group.
    DROP TABLE #G

 SELECT 
    buf.F_House, COUNT(*) AS GroupCount
 INTO #G
 FROM @TestData buf
 GROUP BY buf.F_House -- That logic could be tricky one. Right now simplifying

 DECLARE @F_House INT -- That would be group key



 DECLARE db_cursor CURSOR FOR
 SELECT F_House
 FROM #G
 ORDER BY F_House

 OPEN db_cursor FETCH NEXT FROM db_cursor INTO @F_House

 WHILE @@FETCH_STATUS = 0   
 BEGIN 
    PRINT 'Processing house group: ' + CAST(@F_House AS VARCHAR(10))

    DECLARE @rowsInGroup INT
    SELECT @rowsInGroup = COUNT(*) FROM @TestData
    WHERE F_House = @F_House

    IF OBJECT_ID('tempdb..#TileBatch') IS NOT NULL 
        DROP TABLE #TileBatch

    SELECT 
        T.[NTile], NEWID() AS F_Batch
    INTO #TileBatch
    FROM 
    (
        SELECT distinct
            NTILE(@rowsInGroup / @batchSize) OVER (ORDER BY LINK) AS [NTile]
        from 
            @TestData
        WHERE F_House = @F_House
    ) T

    UPDATE D
    SET D.F_Batch = B.F_Batch
    FROM
    @TestData D
        INNER JOIN
    (
        SELECT 
            *, NTILE(@rowsInGroup / @batchSize) OVER (ORDER BY LINK) AS [NTile]
        from 
            @TestData
        WHERE F_House = @F_House
    ) DT ON D.LINK = DT.LINK
        INNER JOIN
    #TileBatch B ON DT.[NTile] = B.[NTile]
    WHERE D.F_House = @F_House

    FETCH NEXT FROM db_cursor INTO @F_House
 END 

 CLOSE db_cursor   
 DEALLOCATE db_cursor

 SELECT
    buf.F_House, COUNT(distinct F_Batch) AS BatchCountInHouse
 FROM @TestData buf
 GROUP BY buf.F_House
 ORDER BY buf.F_House
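
As a side note, a minimal sketch of how NTILE sizes the resulting batches (the toy table and variable names here are illustrative assumptions, not part of the script above):

 DECLARE @rows INT = 7, @size INT = 3
 DECLARE @Tiles TABLE (LINK INT)
 INSERT INTO @Tiles (LINK) VALUES (1),(2),(3),(4),(5),(6),(7)

 SELECT LINK,
        NTILE(@rows / @size) OVER (ORDER BY LINK) AS [NTile]
 FROM @Tiles
 -- NTILE(2) over 7 rows puts rows 1-4 in tile 1 and rows 5-7 in tile 2:
 -- the remainder is spread over the leading tiles, so batches may come out
 -- slightly larger than @batchSize instead of leaving a small trailing batch.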