对于表中的每个分组,我需要把该组拆分成若干个批次(每个批次包含固定数量的记录),并用相应的批次 ID 标记批次中的每条记录。
目前我基于游标的实现,恕我直言,相当笨拙:拆分 10 000 行需要 1 分钟,不用说,非常慢。有什么办法可以更快地完成这项工作吗?
这是测试脚本。
-- Test script (T-SQL): builds 10 000 sample rows, then splits every F_House
-- group into batches of @batchSize rows and stamps each batch with its own
-- NEWID() value, using a cursor over the groups and a WHILE loop per group.
-- Needed to generate big data
-- @Naturals holds the ten values 1..10; cross joining it four times below
-- yields 10 * 10 * 10 * 10 = 10 000 rows.
DECLARE @Naturals TABLE (ID INT)
INSERT INTO @Naturals (ID)
VALUES (1),(2),(3),(4),(5),(6),(7),(8),(9),(10)
-- Working set: LINK identifies a row, F_House is the group key, and F_Batch
-- is the batch id to fill in (NULL until a row has been assigned).
-- No key or index is declared on this table variable.
DECLARE @TestData TABLE
(
LINK INT,
F_House INT,
F_Batch UNIQUEIDENTIFIER
)
-- LINK receives a row number; F_House receives a row number modulo 5,
-- i.e. a value in 0..4, which gives five groups.
INSERT INTO @TestData (LINK, F_House)
SELECT ROW_NUMBER() OVER (order by T1.ID), ROW_NUMBER() OVER (order by T1.ID) % 5
FROM
@Naturals T1
CROSS JOIN @Naturals T2
CROSS JOIN @Naturals T3
CROSS JOIN @Naturals T4
--CROSS JOIN @Naturals T5 -- that would give us 100 000
-- Finished preparing Data (10 000 rows)
SELECT 'Processing:', COUNT(*) FROM @TestData
DECLARE @batchSize INT -- That would be amount of rows in each batch
SET @batchSize = 50
-- #G lists each distinct F_House with its row count. Only F_House is read
-- back by the cursor below; GroupCount is not used again in this script.
IF OBJECT_ID('tempdb..#G') IS NOT NULL -- Split set of data into groups. We need to create batches in each group.
DROP TABLE #G
SELECT
buf.F_House, COUNT(*) AS GroupCount
INTO #G
FROM @TestData buf
GROUP BY buf.F_House -- That logic could be tricky one. Right now simplifying
DECLARE @F_House INT -- That would be group key
-- Outer loop: visit the groups one at a time in F_House order.
DECLARE db_cursor CURSOR FOR
SELECT F_House
FROM #G
ORDER BY F_House
OPEN db_cursor FETCH NEXT FROM db_cursor INTO @F_House
WHILE @@FETCH_STATUS = 0
BEGIN
PRINT 'Processing house group: ' + CAST(@F_House AS VARCHAR(10))
-- For each group let's create batches
-- Inner loop: keep going while the current group still has a row whose
-- F_Batch is NULL. Each pass tags at most @batchSize such rows.
-- NOTE(review): every pass probes and updates the unindexed table variable,
-- which is presumably where most of the run time goes - confirm with a plan.
WHILE EXISTS (SELECT 1 FROM @TestData AS itmds
WHERE itmds.F_House = @F_House
AND itmds.F_Batch IS NULL
)
BEGIN
-- One fresh GUID per pass; all rows tagged in this pass share it.
DECLARE @batchLink UNIQUEIDENTIFIER
SET @batchLink = NEWID()
UPDATE itmds
SET itmds.F_Batch = @batchLink
FROM @TestData AS itmds
WHERE itmds.F_House = @F_House
AND itmds.F_Batch IS NULL
AND itmds.LINK IN
(
-- TOP has no ORDER BY here, so which untagged rows end up in a given
-- batch is left to the engine.
SELECT TOP (@batchSize)
sub.LINK
FROM @TestData sub
WHERE sub.F_House = @F_House
AND sub.F_Batch IS NULL
)
END
FETCH NEXT FROM db_cursor INTO @F_House
END
CLOSE db_cursor
DEALLOCATE db_cursor
-- Verification: number of distinct batch ids produced per group.
SELECT
buf.F_House, COUNT(distinct F_Batch) AS BatchCountInHouse
FROM @TestData buf
GROUP BY buf.F_House
ORDER BY buf.F_House
预期输出(假设 batchSize = 50):
10 000 行 / 5 个 house = 每个 house 2000 行
2000 行 / 50(batchSize)= 每个 house 40 个批次
答案 0 :(得分:2)
我会使用嵌套循环:外层循环按分组级别逐组处理,内层循环在每个分组内依次划分批次(BatchGroup)。不过,正如您所指出的,表变量和 CTE 存在速度问题,所以这里我改用 tempdb 中的 #临时表进行测试。这样我就可以在插入数据之后创建索引,从而优化性能。我可以在大约 16 秒内对一百万行运行这套聚合逻辑,我认为这样的性能是可以接受的。不过我的开发机是 i7-6700,配有 16 GB DDR4 内存和一块 SSD,实际耗时可能因硬件而异。
-- Demo (T-SQL): number the rows of every Grp into batches of at most
-- @BatchSize rows (BatchGroup = 1, 2, 3, ... restarting for each Grp), using
-- a nested WHILE loop over a temp table that is indexed after loading.

--Make up some fake data for example
DECLARE
      @Start INT = 1
    , @End   INT = 100000
;

SET NOCOUNT ON;

-- #Temp is resolved in tempdb on its own, so DROP needs no database prefix.
IF OBJECT_ID('tempdb..#Temp') IS NOT NULL
    DROP TABLE #Temp;

CREATE TABLE #Temp (Id INT, Grp INT, Val VARCHAR(8), BatchGroup INT);

-- One row per Id in @Start..@End, each with a random Grp in 1..8.
WHILE @Start <= @End
BEGIN
    INSERT INTO #Temp (Id, Grp, Val)
    VALUES (@Start, CAST(RAND() * 8 AS INT) + 1, LEFT(NEWID(), 8));

    SET @Start += 1;
END

-- Index after the load. Id is the trailing key so that the still-unassigned
-- rows of a group (BatchGroup IS NULL) are stored in Id order, which is the
-- order the batching loop below asks for.
CREATE CLUSTERED INDEX IX_Temp_Grp ON #Temp (Grp, BatchGroup, Id);

--Determine Batch Size You want for groupings
DECLARE @BatchSize INT = 1000;

--Let's randomly mess with groupings
-- Four passes; each pass moves up to 4 * @BatchSize rows from one randomly
-- picked group into another, so group sizes are not multiples of @BatchSize.
DECLARE @X INT = 1;
WHILE @X <= 4
BEGIN
    ;WITH x AS
    (
        SELECT TOP (@BatchSize * 4)
              Id
            , Grp
            , Val
        FROM #Temp
        WHERE Grp = CAST(RAND() * 8 AS INT) + 1
    )
    UPDATE x
    SET Grp = CAST(RAND() * 8 AS INT) + 1;

    SET @X += 1;
END

DECLARE
      @CurrentGroup INT = 1
    , @CurrentBatch INT = 1
    , @MaxGroup     INT
;

-- Grp is not modified while batching, so the upper bound is read once
-- instead of being re-queried on every outer iteration.
SELECT @MaxGroup = MAX(Grp) FROM #Temp;

-- Outer loop: walk the group keys 1..@MaxGroup (a key with no rows simply
-- skips the inner loop).
WHILE @CurrentGroup <= @MaxGroup
BEGIN
    -- Inner loop: while the group still has unassigned rows, give the next
    -- @BatchSize of them (lowest Id first) the current batch number.
    WHILE EXISTS (SELECT 1 FROM #Temp WHERE Grp = @CurrentGroup AND BatchGroup IS NULL)
    BEGIN
        ;WITH x AS
        (
            SELECT TOP (@BatchSize)
                BatchGroup
            FROM #Temp
            WHERE Grp = @CurrentGroup
              AND BatchGroup IS NULL
            -- ORDER BY makes the content of every batch deterministic.
            ORDER BY Id
        )
        UPDATE x
        SET BatchGroup = @CurrentBatch;

        SET @CurrentBatch += 1;
    END

    -- Batch numbers restart at 1 for the next group.
    SET @CurrentBatch = 1;
    SET @CurrentGroup += 1;
END

--Proof
SELECT
      Grp
    , COUNT(DISTINCT Id)         AS RowsInGroup
    , COUNT(DISTINCT BatchGroup) AS BatchesInGroup
FROM #Temp
GROUP BY Grp
ORDER BY Grp;
答案 1 :(得分:1)
这是一种基于集合(set-based)、避免使用游标的做法。分配的 F_Batch 是 BIGINT:
-- Set-based batch numbering (no cursor): each F_House is cut, in LINK order,
-- into runs of @batchSize rows and every run gets its own BIGINT F_Batch.
-- Expects @TestData (LINK, F_House) and @batchSize to be declared by the caller.
;WITH baseRowNum AS
(
    SELECT
        LINK,
        F_House,
        -- row number per F_House
        ROW_NUMBER() OVER (PARTITION BY F_House ORDER BY LINK) AS rn
    FROM @TestData
)
SELECT
    LINK,
    F_House,
    rn,
    -- combine F_House & group number into a unique result;
    -- widened to BIGINT so that F_House * 10000 cannot overflow INT
    CAST(F_House AS BIGINT) * 10000 +
    -- start a new sub group for every F_House or after @batchSize rows.
    -- (rn - 1) % @batchSize = 0 marks rows 1, @batchSize + 1, ... and, unlike
    -- rn % @batchSize = 1, still marks every row when @batchSize = 1.
    SUM(CASE WHEN (rn - 1) % @batchSize = 0 THEN 1 ELSE 0 END)
        OVER (ORDER BY F_House, rn
              ROWS UNBOUNDED PRECEDING) AS F_Batch
FROM baseRowNum;
如果您确实需要 UNIQUEIDENTIFIER,可以再联接(JOIN)一次:
-- Set-based batching with a UNIQUEIDENTIFIER per batch: number the batches
-- as a BIGINT first, draw one NEWID() per batch number, then join it back.
-- Expects @TestData (LINK, F_House) and @batchSize to be declared by the caller.
;WITH baseRowNums AS
(
    SELECT
        LINK,
        F_House,
        -- row number per F_House
        ROW_NUMBER() OVER (PARTITION BY F_House ORDER BY LINK) AS rn
    FROM @TestData
)
, batchNums AS
(
    SELECT
        LINK,
        F_House,
        rn,
        -- combine F_House & group number into a unique result;
        -- widened to BIGINT so that F_House * 10000 cannot overflow INT
        CAST(F_House AS BIGINT) * 10000 +
        -- start a new sub group for every F_House or after @batchSize rows.
        -- (rn - 1) % @batchSize = 0 also works for @batchSize = 1, where
        -- rn % @batchSize = 1 would never be true.
        SUM(CASE WHEN (rn - 1) % @batchSize = 0 THEN 1 ELSE 0 END)
            OVER (ORDER BY F_House, rn
                  ROWS UNBOUNDED PRECEDING) AS F_Batch
    FROM baseRowNums
)
, GUIDs AS
(
    -- The aggregate collapses each batch number to a single GUID.
    -- NOTE(review): NEWID() is non-deterministic and a CTE is not guaranteed
    -- to be materialized; if a plan ever evaluates this CTE more than once,
    -- store it in a temp table before joining - verify on the target server.
    SELECT
        F_Batch,
        MAX(NEWID()) AS GUID
    FROM batchNums
    GROUP BY F_Batch
)
-- Summary: rows per batch GUID within each house. For the per-row assignment
-- select b.LINK, b.F_House, g.GUID from the same join without the GROUP BY.
SELECT
    b.F_House,
    g.GUID,
    COUNT(*) AS RowsInBatch
FROM batchNums AS b
INNER JOIN GUIDs AS g
    ON b.F_Batch = g.F_Batch
GROUP BY b.F_House, g.GUID
ORDER BY b.F_House, COUNT(*) DESC;
请参阅Fiddle。
答案 2 :(得分:0)
实际上,我已经尝试了把 NTILE() 和游标结合起来使用,速度相当快(我的意思是,比处理 10 000 行要花 1 分钟快得多)。10 000 行:耗时 0 秒。
100 000 行:耗时 3 秒。
1 000 000 行:耗时 34 秒。
10 000 000 行:耗时 6 分钟。
耗时基本呈线性增长,这很不错。
-- Test script (T-SQL): builds 10 000 sample rows, then for every F_House
-- splits the rows (in LINK order) into NTILE() buckets of at most @batchSize
-- rows and stamps each bucket with its own NEWID(). One cursor step per house.

-- Needed to generate big data
DECLARE @Naturals TABLE (ID INT);
INSERT INTO @Naturals (ID)
VALUES (1),(2),(3),(4),(5),(6),(7),(8),(9),(10);

-- LINK identifies a row (the join further down relies on it being unique),
-- F_House is the group key, F_Batch is the batch id to fill in.
DECLARE @TestData TABLE
(
    LINK    INT,
    F_House INT,
    F_Batch UNIQUEIDENTIFIER
);

-- 10 * 10 * 10 * 10 = 10 000 rows; F_House is a row number modulo 5 (0..4).
INSERT INTO @TestData (LINK, F_House)
SELECT
    ROW_NUMBER() OVER (ORDER BY T1.ID),
    ROW_NUMBER() OVER (ORDER BY T1.ID) % 5
FROM @Naturals T1
CROSS JOIN @Naturals T2
CROSS JOIN @Naturals T3
CROSS JOIN @Naturals T4
--CROSS JOIN @Naturals T5 -- that would give us 100 000
;
-- Finished preparing Data (10 000 rows)

SELECT 'Processing:', COUNT(*) FROM @TestData;

DECLARE @batchSize INT; -- That would be amount of rows in each batch
SET @batchSize = 50;

-- The bucket count below divides by @batchSize, so reject unusable values.
IF @batchSize IS NULL OR @batchSize < 1
BEGIN
    RAISERROR('@batchSize must be a positive integer.', 16, 1);
    RETURN;
END

-- Split set of data into groups. We need to create batches in each group.
IF OBJECT_ID('tempdb..#G') IS NOT NULL
    DROP TABLE #G;

SELECT
    buf.F_House,
    COUNT(*) AS GroupCount
INTO #G
FROM @TestData buf
GROUP BY buf.F_House; -- That logic could be tricky one. Right now simplifying

DECLARE @F_House     INT; -- That would be group key
DECLARE @rowsInGroup INT; -- rows in the current group, taken from #G.GroupCount
DECLARE @tileCount   INT; -- number of batches needed for the current group

-- The cursor is only read forward, once; it also delivers the row count that
-- #G already holds, so the group does not have to be counted again.
DECLARE db_cursor CURSOR LOCAL FAST_FORWARD FOR
    SELECT F_House, GroupCount
    FROM #G
    ORDER BY F_House;

OPEN db_cursor;
FETCH NEXT FROM db_cursor INTO @F_House, @rowsInGroup;

WHILE @@FETCH_STATUS = 0
BEGIN
    PRINT 'Processing house group: ' + CAST(@F_House AS VARCHAR(10));

    -- Ceiling division: a plain @rowsInGroup / @batchSize truncates, which
    -- gives 0 buckets (an NTILE error) for a group smaller than @batchSize
    -- and buckets larger than @batchSize when the group is not a multiple.
    SET @tileCount = (@rowsInGroup + @batchSize - 1) / @batchSize;

    IF OBJECT_ID('tempdb..#TileBatch') IS NOT NULL
        DROP TABLE #TileBatch;

    -- One row, and therefore one GUID, per bucket number of this group.
    SELECT
        T.[NTile],
        NEWID() AS F_Batch
    INTO #TileBatch
    FROM
    (
        SELECT DISTINCT
            NTILE(@tileCount) OVER (ORDER BY LINK) AS [NTile]
        FROM @TestData
        WHERE F_House = @F_House
    ) T;

    -- Recompute the same bucket number per row and copy the bucket's GUID.
    UPDATE D
    SET D.F_Batch = B.F_Batch
    FROM @TestData D
    INNER JOIN
    (
        SELECT
            LINK,
            NTILE(@tileCount) OVER (ORDER BY LINK) AS [NTile]
        FROM @TestData
        WHERE F_House = @F_House
    ) DT
        ON D.LINK = DT.LINK
    INNER JOIN #TileBatch B
        ON DT.[NTile] = B.[NTile]
    WHERE D.F_House = @F_House;

    FETCH NEXT FROM db_cursor INTO @F_House, @rowsInGroup;
END

CLOSE db_cursor;
DEALLOCATE db_cursor;

-- Verification: number of distinct batch ids produced per group.
SELECT
    buf.F_House,
    COUNT(DISTINCT buf.F_Batch) AS BatchCountInHouse
FROM @TestData buf
GROUP BY buf.F_House
ORDER BY buf.F_House;