SQL Server:将一组值拆分为5个组,使每个组的 sum(count) 均匀分布。
表格仅包含2列:rid 和 count。
-- Sample data for the question: one row per rid, with its weight in [count].
-- The 15 weights add up to 55984, so five balanced groups would each total
-- roughly 11197.
CREATE TABLE t1 (
    rid     INT,
    [count] INT  -- bracketed because COUNT is also the name of a built-in aggregate
);

-- Explicit column list so the insert does not depend on the table's column order.
INSERT INTO t1 (rid, [count])
VALUES
    (1, 4567),  (2, 3256),  (3, 5678),  (4, 934),
    (5, 1099),  (6, 3990),  (7, 780),   (8, 6784),
    (9, 7854),  (10, 435),  (11, 3455), (12, 4897),
    (13, 8849), (14, 1019), (15, 2387);
实际表是
rid count
---------
1 4567
2 3256
3 5678
4 934
5 1099
6 3990
7 780
8 6784
9 7854
10 435
11 3455
12 4897
13 8849
14 1019
15 2387
我需要动态地将这些值分成5组,使每组的 sum(count) 均匀分布。
count 列的总和为55984,除以5得到 55984/5 ≈ 11197。也就是说,需要将这些值分为5组,每组的 sum(count) 都应大约等于11197。
答案 0 :(得分:1)
我将从5个随机选择的小组开始:
-- Quick baseline: put the rows in a random order and cut that order into
-- 5 equally sized buckets. NEWID() produces a new ordering on every execution,
-- so group membership (and each group's total) changes from run to run.
-- Reads from t1, the table created for this question.
SELECT
    t.rid,
    t.[count],
    NTILE(5) OVER (ORDER BY NEWID()) AS grp  -- grp is 1..5
FROM t1 AS t;
各组的总和应该会非常接近。如果记录很多并且 count 的分布比较合理,那么按 count 排序后每隔5行取一行(即对行号取模)通常效果也很好:
-- Systematic sample: rank the rows by [count] and deal them out round-robin,
-- so each group receives every 5th row of the sorted list.
-- grp is the row number modulo 5, i.e. the labels are 0..4 (not 1..5).
-- rid is a tie-breaker so rows with equal counts are always ranked the same way.
-- Reads from t1, the table created for this question.
SELECT
    t.rid,
    t.[count],
    (ROW_NUMBER() OVER (ORDER BY t.[count], t.rid) % 5) AS grp
FROM t1 AS t;
如果各行的 count 大小差异非常大,并且您需要最优解,那么您遇到的是一个难题(这本质上是多路数字划分问题,属于 NP 难问题)。
答案 1 :(得分:1)
您可以尝试使用此脚本。
-- Splits t1 into 5 groups whose [count] totals are roughly equal.
--   CTE  : rank the rows from the largest to the smallest [count].
--   CTE2 : cut that ranking into tiers of 5 rows (CEILING(RN / 5.00)) and
--          renumber the rows tier by tier.
--   Final: deal the renumbered rows round-robin into groups 1..5 and show the
--          group's total next to each of its rows.
;WITH CTE AS (
    SELECT
        rid,
        [count],
        -- rid breaks ties so rows with equal counts always get the same rank.
        RN = ROW_NUMBER() OVER (ORDER BY [count] DESC, rid)
    FROM t1
)
, CTE2 AS (
    SELECT
        rid,
        [count],
        RN,
        RN2 = ROW_NUMBER() OVER (
            ORDER BY
                CEILING(RN / 5.00),
                -- 0 for every row of the first tier; a negative multiple of
                -- [count] for later tiers, so those tiers sort ascending.
                ((1 - CEILING(RN / 5.00)) * [count]) DESC,
                -- Tie-breaker: without it the first tier (all keys equal to 0)
                -- is numbered in an arbitrary order and the groups can change
                -- between executions. RN keeps that tier largest-first.
                RN
        )
    FROM CTE
)
SELECT
    CTE2.rid,
    CTE2.[count],
    ((RN2 + 1) % 5) + 1 AS GroupIndex,
    -- Total of the group this row belongs to (same value on every row of the group).
    SUM(CTE2.[count]) OVER (PARTITION BY ((RN2 + 1) % 5)) AS CmlTotal
FROM CTE2
ORDER BY GroupIndex;
结果:
rid count GroupIndex CmlTotal
----------- ----------- -------------------- -----------
3 5678 1 10687
6 3990 1 10687
14 1019 1 10687
5 1099 2 10563
1 4567 2 10563
12 4897 2 10563
15 2387 3 11671
10 435 3 11671
13 8849 3 11671
9 7854 4 11890
7 780 4 11890
2 3256 4 11890
11 3455 5 11173
4 934 5 11173
8 6784 5 11173
答案 2 :(得分:0)
我的做法是:创建一个带有标识列和额外列([Group])的临时表,并按数值从大到小的顺序插入数字。然后我写了一个循环:先给最大的5个数字依次标上第1组到第5组,接着反向给接下来的5个最大数字标上第5组到第1组,然后再次反向,如此往复,直到表的末尾。
-- Working table: RID is the rank of each value (1 = largest [Count]) and
-- [Group] receives the group number assigned below.
CREATE TABLE #T1
(
    RID     INT IDENTITY(1,1),
    [Count] INT,
    [Group] INT
);

-- The ORDER BY controls the order in which identity values are generated, so
-- RID 1 is the largest count, RID 2 the second largest, and so on.
INSERT INTO #T1 ([Count])
SELECT [count]
FROM t1
ORDER BY [count] DESC;
GO

-- Serpentine assignment: the 5 largest values get groups 1..5, the next 5 get
-- groups 5..1, the next 5 get 1..5 again, and so on to the end of the table.
-- This single set-based UPDATE replaces the nested WHILE loops and yields the
-- same pattern. It is keyed on RID rather than on the [Count] value, so rows
-- that share the same [Count] are no longer all pushed into one group.
-- Relies on RID being the contiguous sequence 1..N produced by the insert above.
DECLARE @GroupCount INT = 5;

-- Intentionally no WHERE clause: every row must receive a group.
UPDATE #T1
SET [Group] =
    CASE
        -- Even-numbered block of @GroupCount rows (0, 2, 4, ...): count upwards.
        WHEN ((RID - 1) / @GroupCount) % 2 = 0
            THEN ((RID - 1) % @GroupCount) + 1
        -- Odd-numbered block: count back down.
        ELSE @GroupCount - ((RID - 1) % @GroupCount)
    END;
下面的代码显示五个组各自的总和,以及每组总和与平均值(所有数字之和除以5)之间的差值。
-- Reports each group's total and its distance from the ideal share.
-- @AVGROUP is the grand total divided by 5 (integer division, remainder dropped).
DECLARE @AVGROUP INT = (SELECT SUM([Count]) / 5 FROM #T1);

-- One row per group via GROUP BY. Chaining five per-group SELECTs with UNION
-- would silently drop a row whenever two groups had the same total, because
-- UNION removes duplicate rows. A group with no rows produces no output row.
SELECT
    SUM([Count])            AS SUMCOUNT,
    -- Signed difference from the target share (not a statistical variance).
    @AVGROUP - SUM([Count]) AS VARIANCE
FROM #T1
WHERE [Group] BETWEEN 1 AND 5
GROUP BY [Group]
ORDER BY [Group];
这样够准确吗?换句话说,各组之间约1274的差异范围是否算得上均匀分布?如果需要,我认为还可以做得更精确;如果这已经足够准确,那就好。
下面的代码显示了这些组的构成:
-- Shows how each group is composed: its first three members (N1..N3, in RID
-- order, i.e. largest value first), the group total, and the distance from the
-- ideal share.
-- @AVGROUP is the grand total divided by 5 (integer division, remainder dropped).
DECLARE @AVGROUP INT = (SELECT SUM([Count]) / 5 FROM #T1);

WITH RANKED AS
(
    -- Position of every row inside its own group, instead of hard-coded RIDs,
    -- so the report does not depend on the table holding exactly 15 rows.
    SELECT
        [Group] AS GROUPNO,
        [Count] AS MEMBERCOUNT,
        ROW_NUMBER() OVER (PARTITION BY [Group] ORDER BY RID) AS POS
    FROM #T1
    WHERE [Group] BETWEEN 1 AND 5
)
,
CTE (GROUPNO, N1, N2, N3, SUMCOUNT) AS
(
    -- Pivot the first three members of each group into columns.
    -- SUMCOUNT covers every member of the group, including any beyond the third.
    SELECT
        GROUPNO,
        MAX(CASE WHEN POS = 1 THEN MEMBERCOUNT END),
        MAX(CASE WHEN POS = 2 THEN MEMBERCOUNT END),
        MAX(CASE WHEN POS = 3 THEN MEMBERCOUNT END),
        SUM(MEMBERCOUNT)
    FROM RANKED
    GROUP BY GROUPNO
)
SELECT
    CAST(GROUPNO AS VARCHAR(11)) AS [GROUP],  -- kept as a character label, as before
    N1,
    N2,
    N3,
    SUMCOUNT,
    -- Signed difference from the target share (not a statistical variance).
    @AVGROUP - SUMCOUNT AS VARIANCE
FROM CTE
ORDER BY GROUPNO;