我的表t_stats
包含列id (INT)
和列ratio (DECIMAL(8,4))
。
id
是唯一的。
我想查询表t_stats
,以便选择具有相同AVG(ratio)
(最接近的)的3个组。
可以使用临时表来完成,只要我可以将其作为脚本或存储过程运行。
编辑:这是具体的例子:
INPUT:
id ratio
-- -----
24 0.930000
25 0.390000
26 0.800000
27 0.920000
28 0.550000
30 0.810000
31 0.770000
32 0.800000
33 0.590000
36 0.760000
37 0.910000
40 0.690000
43 0.390000
45 0.310000
46 0.760000
47 0.710000
54 0.710000
55 0.950000
57 0.920000
60 0.890000
62 0.700000
66 0.890000
68 0.950000
107 0.760000
559 0.990000
560 0.540000
565 0.430000
566 0.830000
568 0.590000
579 0.970000
599 0.900000
623 0.450000
749 0.800000
750 0.970000
753 0.820000
754 0.730000
766 0.620000
768 0.430000
770 0.790000
838 0.700000
875 0.835000
987 0.900000
988 0.740000
1157 0.850000
1250 0.630000
1328 0.860000
2171 0.900000
2176 0.520000
2177 0.980000
2178 0.940000
2180 0.970000
2184 0.990000
2187 0.950000
2188 0.940000
2189 0.920000
2195 0.990000
2233 0.900000
2234 0.940000
2235 0.950000
2240 0.980000
2243 0.920000
2253 0.900000
2266 0.530000
2269 0.920000
2270 0.970000
2271 0.750000
2272 0.820000
2275 0.910000
2277 0.930000
2281 0.690000
2282 0.710000
2288 0.840000
2528 0.870000
2778 0.950000
2814 0.990000
输出:
groupId id ratio
------- -- -----
1 24 0.930000
1 25 0.390000
1 27 0.920000
1 30 0.810000
1 32 0.800000
1 36 0.760000
1 54 0.710000
1 60 0.890000
1 559 0.990000
1 560 0.540000
1 566 0.830000
1 568 0.590000
1 623 0.450000
1 750 0.970000
1 838 0.700000
1 987 0.900000
1 1157 0.850000
1 2178 0.940000
1 2180 0.970000
1 2253 0.900000
1 2269 0.920000
1 2271 0.750000
1 2281 0.690000
1 2778 0.950000
1 2814 0.990000
2 26 0.800000
2 28 0.550000
2 31 0.770000
2 40 0.690000
2 45 0.310000
2 55 0.950000
2 57 0.920000
2 66 0.890000
2 107 0.760000
2 565 0.430000
2 579 0.970000
2 753 0.820000
2 754 0.730000
2 766 0.620000
2 875 0.835000
2 1328 0.860000
2 2176 0.520000
2 2177 0.980000
2 2184 0.990000
2 2187 0.950000
2 2189 0.920000
2 2233 0.900000
2 2234 0.940000
2 2275 0.910000
2 2282 0.710000
3 33 0.590000
3 37 0.910000
3 43 0.390000
3 46 0.760000
3 47 0.710000
3 62 0.700000
3 68 0.950000
3 599 0.900000
3 749 0.800000
3 768 0.430000
3 770 0.790000
3 988 0.740000
3 1250 0.630000
3 2171 0.900000
3 2188 0.940000
3 2195 0.990000
3 2235 0.950000
3 2240 0.980000
3 2243 0.920000
3 2266 0.530000
3 2270 0.970000
3 2272 0.820000
3 2277 0.930000
3 2288 0.840000
3 2528 0.870000
所以我想制作3组n
值,并针对特定的平均值x
。 (例如,n=30
和0.75 < x < 0.85
看起来像3组,每组30个值,每个组有0.75 < AVG(ratio) < 0.85
个,而id
只能属于1个组。)
因此每组的平均值几乎相同,接近x
:
groupId avg(ratio)
------- ----------
1 0.805600
2 0.789000
3 0.797600
答案 0 :(得分:3)
这是一个有点像draft的T-SQL程序版本,根据需要每轮只对草稿顺序进行优化。
如果要挑选所有项目,这种“竞争”性质似乎会导致略低于完美的比率,但从好的方面来看,你基本上有一个O(N ^ 2)算法,因为它基本上是一分钟循环中的函数(考虑到group by
子句,可能是乐观的)。它也是确定性的,如果有必要,应该在另一层实现相当简单。
-- SET THESE!
declare @numberOfGroups int = 3
declare @itemsPerGroup int = 25
declare @targetRatio decimal(8,4) = .8
-- /SET
set nocount on
-- Create a table of items
declare @t_stats table (
id int not null primary key
, ratio decimal(8,4) not null
, grp int null
)
insert into @t_stats (id, ratio) values
(24,0.930000), (25,0.390000), (26,0.800000),
(27,0.920000), (28,0.550000), (30,0.810000),
(31,0.770000), (32,0.800000), (33,0.590000),
(36,0.760000), (37,0.910000), (40,0.690000),
(43,0.390000), (45,0.310000), (46,0.760000),
(47,0.710000), (54,0.710000), (55,0.950000),
(57,0.920000), (60,0.890000), (62,0.700000),
(66,0.890000), (68,0.950000), (107,0.760000),
(559,0.990000), (560,0.540000), (565,0.430000),
(566,0.830000), (568,0.590000), (579,0.970000),
(599,0.900000), (623,0.450000), (749,0.800000),
(750,0.970000), (753,0.820000), (754,0.730000),
(766,0.620000), (768,0.430000), (770,0.790000),
(838,0.700000), (875,0.835000), (987,0.900000),
(988,0.740000), (1157,0.850000), (1250,0.630000),
(1328,0.860000), (2171,0.900000), (2176,0.520000),
(2177,0.980000), (2178,0.940000), (2180,0.970000),
(2184,0.990000), (2187,0.950000), (2188,0.940000),
(2189,0.920000), (2195,0.990000), (2233,0.900000),
(2234,0.940000), (2235,0.950000), (2240,0.980000),
(2243,0.920000), (2253,0.900000), (2266,0.530000),
(2269,0.920000), (2270,0.970000), (2271,0.750000),
(2272,0.820000), (2275,0.910000), (2277,0.930000),
(2281,0.690000), (2282,0.710000), (2288,0.840000),
(2528,0.870000), (2778,0.950000), (2814,0.990000)
-- Create a table of groups
declare @groups table (
grp int not null primary key identity
)
while (select isnull(max(grp), 0) from @groups) < @numberOfGroups begin
insert into @groups default values
end
-- Check that we have enough items to fill all groups
if @numberOfGroups * @itemsPerGroup <= (select count(*) from @t_stats) begin
-- Groups now pick the best-fitting items one at a time
while (select count(*) from @t_stats where grp is not null) < (select count(*) * @itemsPerGroup from @groups) begin
declare @grp int, @Num int, @ratio decimal(8,4), @id int
-- Find the group with the least number of items or the worst ratio
select top 1 @grp = grp, @Num = Num, @ratio = ratio
from (
select g.grp
, count(i.grp) as Num
, isnull(avg(i.ratio), 0.0) as ratio
, abs(@targetRatio - avg(i.ratio)) as RatioDist
from @groups g
left join @t_stats i on g.grp = i.grp
group by g.grp
) as a
order by Num, RatioDist, grp
-- Let that group make their best pick
select top 1 @id = id
from (
select id
, abs(((ratio + (@ratio * @Num)) / (@Num + 1)) - @targetRatio) as NewRatioDist
from @t_stats
where grp is null
) as a
order by NewRatioDist
-- Update the items table based upon the pick
update @t_stats set grp = @grp where id = @id
end
end
else begin
-- Not enought items
raiserror('Too many groups or items per group.', 17, 0)
end
-- Display the results
select grp, count(*) as Num, avg(ratio) as ratio
from @t_stats
group by grp
order by grp
答案 1 :(得分:2)
试试这个
Declare @t Table (Id Int, Ratio DECIMAL(8,2))
Insert Into @t Values(1,0.5),(2,0.55),(3,0.97),(4,0.77),(5,0.97),(6,0.99),(7,1.0),(8,0.15),(9,0.33)
DECLARE @MeanSum DECIMAL(8,2)
SELECT @MeanSum =SUM(Ratio)/3 FROM @T
;WITH Cte (Id,Ratio,Ids,RatioValues,RatioTotalWeight,Level) AS
(
SELECT Id
,Ratio
, ',' + CAST(Id AS VARCHAR(MAX))
,',' + CAST(Ratio AS VARCHAR(MAX))
,CAST(Ratio AS DECIMAL(8,2))
,1
FROM @t
UNION ALL
SELECT
p.Id
, p.Ratio
,c.Ids + ',' + CAST(p.Id AS VARCHAR(MAX))
,c.RatioValues + ',' + CAST(p.Ratio AS VARCHAR(MAX))
,CAST(c.RatioTotalWeight + p.Ratio AS DECIMAL(8,2))
,c.Level+1
FROM @t AS p JOIN Cte c ON p.Id < c.Id
WHERE c.Level < 3
),CTEOf3Groups AS(
SELECT
Ids = STUFF(Ids,1,1,'')
,RatioValues
,RatioTotalWeight
, FirstChar = SUBSTRING(STUFF(Ids,1,1,''),0,CHARINDEX(',',STUFF(Ids,1,1,'')))
,DENSE_RANK() OVER(ORDER BY ABS(RatioTotalWeight - @MeanSum)) [rank] -- gets the closest distance
FROM CTE
),CteGetTheRanks AS(
Select *, Rn = Row_Number() Over(Partition By FirstChar Order by FirstChar, [Rank] )
From CTEOf3Groups)
,CteGroups AS(
SELECT [GroupId] = Row_Number() Over( Order By (Select 1)), Ids,[Rank]
FROM CteGetTheRanks
Where [Rank]<=3
AND Rn = 1)
SELECT X.[GroupId],X.Id,t.Ratio
FROM
(
SELECT F1.[GroupId],
O.splitdata AS ID
FROM
(
SELECT *,
CAST('<X>'+REPLACE(F.Ids,',','</X><X>')+'</X>' AS XML) AS xmlfilter
FROM CteGroups F
)F1
CROSS APPLY
(
SELECT fdata.D.value('.','varchar(50)') AS splitdata
FROM f1.xmlfilter.nodes('X') As fdata(D)
) O
)X JOIN @t t ON t.Id = X.ID
ORDER BY 1,2
OPTION (MAXRECURSION 0)
<强>结果强>
<强>被修改强> 我尝试了你提供的样本数据(ddl提供了你的参考)
Declare @t Table (Id Int, Ratio DECIMAL(8,2))
Insert Into @t Values
(52,0.930000),(53,0.390000),(54,0.800000),(55,0.920000),(56,0.550000),
(58,0.810000),(59,0.770000),(60,0.800000),(61,0.590000),(64,0.760000),
(65,0.910000),(68,0.690000),(71,0.390000),(73,0.310000),(74,0.760000),
(75,0.710000),(82,0.710000),(83,0.950000),(85,0.920000),(88,0.890000),
(90,0.700000),(94,0.890000),(96,0.950000),(135,0.760000),(587,0.990000),
(588,0.540000),(593,0.430000),(594,0.830000),(596,0.590000),(607,0.970000),
(627,0.900000),(651,0.450000),(777,0.800000),(778,0.970000),(781,0.820000),
(782,0.730000),(794,0.620000),(796,0.430000),(798,0.790000),(866,0.700000),
(903,0.835000),(1015,0.900000),(1016,0.740000),(1185,0.850000),(1278,0.630000),
(1356,0.860000),(2199,0.900000),(2204,0.520000),(2205,0.980000),(2206,0.940000),
(2208,0.970000),(2212,0.990000),(2215,0.950000),(2216,0.940000),(2217,0.920000),
(2223,0.990000),(2261,0.900000),(2262,0.940000),(2263,0.950000),(2268,0.980000),
(2271,0.920000),(2281,0.900000),(2294,0.530000),(2297,0.920000),(2298,0.970000),
(2299,0.750000),(2300,0.820000),(2303,0.910000),(2305,0.930000),(2309,0.690000),
(2310,0.710000),(2316,0.840000),(2556,0.870000),(2806,0.950000),(2842,0.990000),
(2844,0.710000),(2977,0.730000),(2985,0.960000),(3008,0.710000),(3042,0.910000),
(3061,0.830000),(3243,0.900000),(3346,0.800000),(3371,0.800000),(3497,0.990000),
(3838,0.730000),(4000,0.980000),(4001,0.890000),(4002,0.850000),(4003,0.490000),
(4004,0.970000),(4009,0.930000),(4032,0.930000),(4095,0.460000),(4428,0.610000),
(4438,0.960000),(4439,0.930000),(4445,0.650000),(4446,0.660000),(4447,0.490000),
(4455,0.880000),(4457,0.890000),(4460,0.980000),(4469,0.930000),(4473,0.980000),
(4474,0.950000),(4475,0.940000),(4481,0.400000),(4489,0.760000),(4490,0.470000)
结果是
执行所需的时间是27秒。请从你的结尾(也是结果)进行测试并告诉我。
<强>被修改强>
75记录DDL
Declare @t Table (Id Int, Ratio DECIMAL(8,4))
Insert Into @t Values
(24,0.930000),(25,0.390000),(26,0.800000),(27,0.920000),
(28,0.550000),(30,0.810000),(31,0.770000),(32,0.800000),
(33,0.590000),(36,0.760000),(37,0.910000),(40,0.690000),
(43,0.390000),(45,0.310000),(46,0.760000),(47,0.710000),
(54,0.710000),(55,0.950000),(57,0.920000),(60,0.890000),
(62,0.700000),(66,0.890000),(68,0.950000),(107,0.760000),
(559,0.990000),(560,0.540000),(565,0.430000),(566,0.830000),
(568,0.590000),(579,0.970000),(599,0.900000),(623,0.450000),
(749,0.800000),(750,0.970000),(753,0.820000),(754,0.730000),
(766,0.620000),(768,0.430000),(770,0.790000),(838,0.700000),
(875,0.835000),(987,0.900000),(988,0.740000),(1157,0.850000),
(1250,0.630000),(1328,0.860000),(2171,0.900000),(2176,0.520000),
(2177,0.980000),(2178,0.940000),(2180,0.970000),(2184,0.990000),
(2187,0.950000),(2188,0.940000),(2189,0.920000),(2195,0.990000),
(2233,0.900000),(2234,0.940000),(2235,0.950000),(2240,0.980000),
(2243,0.920000),(2253,0.900000),(2266,0.530000),(2269,0.920000),
(2270,0.970000),(2271,0.750000),(2272,0.820000),(2275,0.910000),
(2277,0.930000),(2281,0.690000),(2282,0.710000),(2288,0.840000),
(2528,0.870000),(2778,0.950000),(2814,0.990000)
答案 2 :(得分:1)
SQL确实不是解决此类问题的最佳工具。
然而,有时用TSQL锤击一些螺丝很有趣!!
这是一项努力,可以获得75行示例数据的以下内容:
GroupId Average Count
----------- --------------------------------------- -----------
1 0.798400 25
2 0.796600 25
3 0.797200 25
在我的机器上不到一秒钟。
只有一点需要注意:这种方法存在很大的缺陷,但是如果你需要在SQL中做这件事,你可能会对它们进行一些修改,我只是没有时间。
-- **Expects data in table t_stats (id, ratio)**
if OBJECT_ID('tempdb..#data') is not null drop table #data
if OBJECT_ID('tempdb..#pairsets') is not null drop table #pairsets
if OBJECT_ID('tempdb..#pairseed') is not null drop table #pairseed
if OBJECT_ID('tempdb..#match') is not null drop proc #match
go
-- rather horrible routine using dsql to find either:
-- 1) groups of values that sum to exactly @targetsum (only if @targetsum non null)
-- 2) the group containing the least values that includes data id @includeid and where the sum is within +- @targetsumrange
create proc #match(@targetsum DECIMAL(8,4), @includeid int, @targetsumrange DECIMAL(8,4)) as
begin
set nocount on
declare @nearestmatch bit = 0
if @targetsum is null set @nearestmatch = 1
declare @combination table (value int, asstring varchar(10), alias varchar(50))
declare @savedpairseed int = (select pairseed from #pairseed)
declare @stmtTemplate varchar(max) = 'declare @pairseed int = (select pairseed from #pairseed)
declare @DistSum DECIMAL(8,4)
<DeclareVars>
declare candloop cursor for select <SelectList>, <DistanceSum> as Dist_sum from <TableList> where <IdCheck> <SumCheck>
open candloop
fetch next from candloop into <VarsList>, @DistSum
while @@fetch_status = 0
begin
if (select count(*) from #data where id in (<VarsList>)) = <VarsCount>
begin
<DeleteData>
<InsertPairs>
set @pairseed = @pairseed + 1
end
fetch next from candloop into <VarsList>, @DistSum
end
close candloop
deallocate candloop
update #pairseed set pairseed = @pairseed '
declare @combinations int = 1
declare @maxcombinations int = 8
while @combinations <= @maxcombinations
begin
insert @combination select @combinations, cast(@combinations as varchar(10)), char(ascii('a') + @combinations-1)
declare @DeclareVars varchar(max) = ''
declare @SelectList varchar(max) = ''
declare @TableList varchar(max) = ''
declare @IdCheck varchar(max) = ''
declare @DistanceSum varchar(max) = ''
declare @InsertPairs varchar(max) = ''
declare @VarsList varchar(max) = ''
declare @SumCheck varchar(max) = ''
declare @DeleteData varchar(max) = 'delete #data where id in (<VarsList>)'
select @DeclareVars = @DeclareVars + 'declare @id'+asstring+ ' int ' from @combination
select @SelectList = @SelectList + alias +'.id, ' from @combination
set @SelectList = SUBSTRING(@selectlist, 1, LEN(@SelectList)-1)
select @TableList = @TableList + '#data '+alias+', ' from @combination
set @TableList = SUBSTRING(@TableList, 1, LEN(@TableList)-1)
select @IdCheck = @IdCheck + a.alias+'.id < '+b.alias+'.id and '
from @combination a join @combination b on a.value+1 = b.value
if LEN(@IdCheck) > 4
set @IdCheck = SUBSTRING(@IdCheck, 1, LEN(@IdCheck)-4) + ' and '
select @DistanceSum = @DistanceSum + alias+'.targetdistance + ' from @combination
set @DistanceSum = SUBSTRING(@DistanceSum, 1, LEN(@DistanceSum)-2)
select @VarsList = @VarsList + '@id'+asstring+ ', ' from @combination
set @VarsList = SUBSTRING(@VarsList, 1, LEN(@VarsList)-1)
select @InsertPairs = @InsertPairs + 'insert #pairsets select @pairseed, @id'+asstring+ ', @DistSum'+ CHAR(10) from @combination
set @SumCheck = @DistanceSum + ' = '+ cast(@Targetsum as varchar(20))
if @nearestmatch = 1
begin
set @SumCheck = '('
select @SumCheck = @SumCheck + alias+'.id = '+CAST(@includeid as varchar(10))+' or ' from @combination
if LEN(@SumCheck) > 4
set @SumCheck = SUBSTRING(@SumCheck, 1, LEN(@SumCheck)-3)
set @SumCheck = @SumCheck + ')'
set @DeleteData = ''
end
declare @stmt varchar(max)
set @stmt = REPLACE(@stmtTemplate, '<DeclareVars>', @DeclareVars)
set @stmt = REPLACE(@stmt, '<DeleteData>', @DeleteData)
set @stmt = REPLACE(@stmt, '<SelectList>', @SelectList)
set @stmt = REPLACE(@stmt, '<TableList>', @TableList)
set @stmt = REPLACE(@stmt, '<IdCheck>', @IdCheck)
set @stmt = REPLACE(@stmt, '<DistanceSum>', @DistanceSum)
set @stmt = REPLACE(@stmt, '<InsertPairs>', @InsertPairs)
set @stmt = REPLACE(@stmt, '<VarsList>', @VarsList)
set @stmt = REPLACE(@stmt, '<VarsCount>', cast(@combinations as varchar(10)))
set @stmt = REPLACE(@stmt, '<SumCheck>', @SumCheck)
exec (@stmt)
set @combinations = @combinations + 1
end
if @nearestmatch = 1
begin
-- above will have recorded all possible matches within range
-- remove all but the closest and reindex the pair ids
declare @bestmatch int
select top 1 @bestmatch = pairid from #pairsets where pairid >= @savedpairseed and ABS(distsum) < @targetsumrange
delete #pairsets where pairid >= @savedpairseed and pairid <> ISNULL(@bestmatch, -1)
delete #data where id in (select id from #pairsets where pairid = @bestmatch)
update #pairsets set pairid = @savedpairseed where pairid = @bestmatch
update #pairseed set pairseed = @savedpairseed+1
end
end
go
set nocount on
-- set the parameters
declare @xmin DECIMAL(8,4) = 0.75
declare @xmax DECIMAL(8,4) = 0.85
declare @xrange DECIMAL(8,4) = @xmax - @xmin
declare @xtarg DECIMAL(8,4) = (@xmin+@xmax) / 2
declare @ngroups int = 3
declare @targetgroupsize int = 25
declare @maxbalancedpair int
-- copy the ratio data (using 75 row data from updated question)
select *, ratio-@xtarg as targetdistance, abs(ratio - @xtarg) as targetdistanceabsolute into #data from t_stats
create table #pairseed (pairseed int)
create table #pairsets (pairid int, id int, distsum DECIMAL(8,4) )
insert #pairseed select 1
-- due to the 2 decimal points and distribution of the data we can find many n-tuples that sum to zero
exec #match 0, 0, 0
select @maxbalancedpair = pairseed-1 from #pairseed
declare @deviants table (id int)
declare @most_deviant int
while exists(select * from #data where id not in (select id from @deviants))
begin
select top 1 @most_deviant = id from #data where id not in (select id from @deviants) order by targetdistanceabsolute desc
insert @deviants select @most_deviant
exec #match null, @most_deviant, @xrange
end
-- in general there would have to be some backtracking here
-- now its a box-packing problem, but for simplicity just assign them round robin
declare @output_group_pairs table (groupid int, pairid int)
declare @groupidx int = 1
declare @numgroups int = 3
declare @pairid int
select @pairid = pairseed-1 from #pairseed
while @pairid >= 0
begin
insert @output_group_pairs select @groupidx, @pairid
set @pairid = @pairid - 1
set @groupidx = (@groupidx % @numgroups) + 1
end
-- wimpy effort at redistributing the groups evenly
-- todo: many cases will not work, should use a proper algorithm
declare @maxiter int = 100
declare @previouspairs table (pairid int)
declare @previousgroups table (groupid int)
while exists(select groupid from @output_group_pairs a join #pairsets b on a.pairid = b.pairid group by groupid having COUNT(id) < @targetgroupsize)
begin
set @maxiter = @maxiter-1
if @maxiter = 0 break
declare @groupid int = -1
declare @amountout int
select @groupid = groupid, @amountout = @targetgroupsize-COUNT(*)
from @output_group_pairs a join #pairsets b on a.pairid = b.pairid
where groupid not in (select groupid from @previousgroups)
group by groupid having COUNT(*) < @targetgroupsize
if @groupid = -1 break
declare @targetpair int = -1
select @targetpair = a.pairid from @output_group_pairs a
join (select pairid from #pairsets group by pairid having COUNT(*) <= @amountout) b on a.pairid = b.pairid
join (select groupid, count(id) groupcount from @output_group_pairs a join #pairsets b on a.pairid = b.pairid group by groupid) group_counts on a.groupid = group_counts.groupid
where a.pairid not in (select pairid from @previouspairs)
order by abs(@amountout - groupcount) asc
if @targetpair = -1
begin
insert @previousgroups select @groupid
end
else
begin
insert @previouspairs select @targetpair
update @output_group_pairs set groupid = @groupid where pairid = @targetpair
end
end
set @maxiter = 100
delete @previouspairs
delete @previousgroups
while exists(select groupid from @output_group_pairs a join #pairsets b on a.pairid = b.pairid group by groupid having COUNT(id) > @targetgroupsize)
begin
set @maxiter = @maxiter-1
if @maxiter = 0 break
set @groupid = -1
set @amountout = null
select @groupid = groupid, @amountout = COUNT(*)-@targetgroupsize
from @output_group_pairs a join #pairsets b on a.pairid = b.pairid
where groupid not in (select groupid from @previousgroups)
group by groupid having COUNT(*) > @targetgroupsize
if @groupid = -1 break
set @targetpair = -1
select @targetpair = a.pairid from @output_group_pairs a
join (select pairid from #pairsets group by pairid having COUNT(*) <= @amountout) b on a.pairid = b.pairid
join (select groupid, count(id) groupcount from @output_group_pairs a join #pairsets b on a.pairid = b.pairid group by groupid) group_counts on a.groupid = group_counts.groupid
where a.pairid not in (select pairid from @previouspairs)
order by abs(@amountout - groupcount) asc
if @targetpair = -1
begin
insert @previousgroups select @groupid
end
else
begin
insert @previouspairs select @targetpair
delete @output_group_pairs where pairid = @targetpair
end
end
-- output groups and their stats
select GroupId, Id from @output_group_pairs a join #pairsets b on a.pairid = b.pairid order by 1, 2
select a.GroupId, AVG(c.ratio) as [Average] , count(*) as [Count]
from @output_group_pairs a
join #pairsets b on a.pairid = b.pairid
join t_stats c on b.id = c.id
group by a.groupid
go
drop table #data
drop table #pairsets
drop table #pairseed
drop proc #match
答案 3 :(得分:0)
哦,如果一个组中的项目数量是一个精确的要求,这里的版本对于盒子包装部分使用相同的详尽匹配,但它的速度要慢得多。
-- **Expects data in table t_stats (id, ratio)**
if OBJECT_ID('tempdb..#data') is not null drop table #data
if OBJECT_ID('tempdb..#data_pairs') is not null drop table #data_pairs
if OBJECT_ID('tempdb..#pairsets') is not null drop table #pairsets
if OBJECT_ID('tempdb..#pairseed') is not null drop table #pairseed
if OBJECT_ID('tempdb..#match') is not null drop proc #match
go
-- rather horrible routine using dsql to find either:
-- 1) groups of values that sum to exactly @targetsum (only if @targetsum non null)
-- 2) the group containing the least values that includes data id @includeid and where the sum is within +- @targetsumrange
create proc #match(@targetsum DECIMAL(8,4), @maxcombinations int, @includeid int, @targetsumrange DECIMAL(8,4)) as
begin
set nocount on
declare @nearestmatch bit = 0
if @targetsum is null set @nearestmatch = 1
declare @combination table (value int, asstring varchar(10), alias varchar(50))
declare @savedpairseed int = (select pairseed from #pairseed)
declare @stmtTemplate varchar(max) = 'declare @pairseed int = (select pairseed from #pairseed)
declare @DistSum DECIMAL(8,4)
<DeclareVars>
declare candloop cursor for select <SelectList>, <DistanceSum> as Dist_sum from <TableList> where <IdCheck> <SumCheck>
open candloop
fetch next from candloop into <VarsList>, @DistSum
while @@fetch_status = 0
begin
if (select count(*) from #data where id in (<VarsList>)) = <VarsCount>
begin
<DeleteData>
<InsertPairs>
set @pairseed = @pairseed + 1
end
fetch next from candloop into <VarsList>, @DistSum
end
close candloop
deallocate candloop
update #pairseed set pairseed = @pairseed '
declare @combinations int = 1
while @combinations <= @maxcombinations
begin
insert @combination
select @combinations, cast(@combinations as varchar(10)), CHAR(ASCII('a')+ (@combinations-1)%26) + char(ascii('a') + @combinations-1)
declare @DeclareVars varchar(max) = ''
declare @SelectList varchar(max) = ''
declare @TableList varchar(max) = ''
declare @IdCheck varchar(max) = ''
declare @DistanceSum varchar(max) = ''
declare @InsertPairs varchar(max) = ''
declare @VarsList varchar(max) = ''
declare @SumCheck varchar(max) = ''
declare @DeleteData varchar(max) = 'delete #data where id in (<VarsList>)'
select @DeclareVars = @DeclareVars + 'declare @id'+asstring+ ' int ' from @combination
select @SelectList = @SelectList + alias +'.id, ' from @combination
set @SelectList = SUBSTRING(@selectlist, 1, LEN(@SelectList)-1)
select @TableList = @TableList + '#data '+alias+', ' from @combination
set @TableList = SUBSTRING(@TableList, 1, LEN(@TableList)-1)
select @IdCheck = @IdCheck + a.alias+'.id < '+b.alias+'.id and '
from @combination a join @combination b on a.value+1 = b.value
if LEN(@IdCheck) > 4
set @IdCheck = SUBSTRING(@IdCheck, 1, LEN(@IdCheck)-4) + ' and '
select @DistanceSum = @DistanceSum + alias+'.targetdistance + ' from @combination
set @DistanceSum = SUBSTRING(@DistanceSum, 1, LEN(@DistanceSum)-2)
select @VarsList = @VarsList + '@id'+asstring+ ', ' from @combination
set @VarsList = SUBSTRING(@VarsList, 1, LEN(@VarsList)-1)
select @InsertPairs = @InsertPairs + 'insert #pairsets select @pairseed, @id'+asstring+ ', @DistSum'+ CHAR(10) from @combination
set @SumCheck = @DistanceSum + ' = '+ cast(@Targetsum as varchar(20))
if @nearestmatch = 1
begin
set @SumCheck = '('
select @SumCheck = @SumCheck + alias+'.id = '+CAST(@includeid as varchar(10))+' or ' from @combination
if LEN(@SumCheck) > 4
set @SumCheck = SUBSTRING(@SumCheck, 1, LEN(@SumCheck)-3)
set @SumCheck = @SumCheck + ')'
set @DeleteData = ''
end
declare @stmt varchar(max)
set @stmt = REPLACE(@stmtTemplate, '<DeclareVars>', @DeclareVars)
set @stmt = REPLACE(@stmt, '<DeleteData>', @DeleteData)
set @stmt = REPLACE(@stmt, '<SelectList>', @SelectList)
set @stmt = REPLACE(@stmt, '<TableList>', @TableList)
set @stmt = REPLACE(@stmt, '<IdCheck>', @IdCheck)
set @stmt = REPLACE(@stmt, '<DistanceSum>', @DistanceSum)
set @stmt = REPLACE(@stmt, '<InsertPairs>', @InsertPairs)
set @stmt = REPLACE(@stmt, '<VarsList>', @VarsList)
set @stmt = REPLACE(@stmt, '<VarsCount>', cast(@combinations as varchar(10)))
set @stmt = REPLACE(@stmt, '<SumCheck>', @SumCheck)
exec (@stmt)
set @combinations = @combinations + 1
end
if @nearestmatch = 1
begin
-- above will have recorded all possible matches within range
-- remove all but the closest and reindex the pair ids
declare @bestmatch int
select top 1 @bestmatch = pairid from #pairsets where pairid >= @savedpairseed and ABS(distsum) < @targetsumrange
delete #pairsets where pairid >= @savedpairseed and pairid <> ISNULL(@bestmatch, -1)
delete #data where id in (select id from #pairsets where pairid = @bestmatch)
update #pairsets set pairid = @savedpairseed where pairid = @bestmatch
update #pairseed set pairseed = @savedpairseed+1
end
end
go
set nocount on
-- set the parameters
declare @xmin DECIMAL(8,4) = 0.75
declare @xmax DECIMAL(8,4) = 0.85
declare @xrange DECIMAL(8,4) = @xmax - @xmin
declare @xtarg DECIMAL(8,4) = (@xmin+@xmax) / 2
declare @ngroups int = 3
declare @targetgroupsize int = 5
declare @maxbalancedpair int
-- copy the ratio data (using 75 row data from updated question)
select *, ratio-@xtarg as targetdistance, abs(ratio - @xtarg) as targetdistanceabsolute into #data from t_stats
create table #pairseed (pairseed int)
create table #pairsets (pairid int, id int, distsum DECIMAL(8,4) )
insert #pairseed select 1
-- due to the 2 decimal points and distribution of the data we can find many n-tuples that sum to zero
exec #match 0, 8, 0, 0
select @maxbalancedpair = pairseed-1 from #pairseed
declare @deviants table (id int)
declare @most_deviant int
while exists(select * from #data where id not in (select id from @deviants))
begin
select top 1 @most_deviant = id from #data where id not in (select id from @deviants) order by targetdistanceabsolute desc
insert @deviants select @most_deviant
exec #match null, 8, @most_deviant, @xrange
end
select * into #data_pairs from #pairsets
delete #data
delete #pairsets
update #pairseed set pairseed = 1
insert #data
select pairid, COUNT(*), COUNT(*), COUNT(*) from #data_pairs group by pairid
if (select SUM(ratio) from #data) < @targetgroupsize * @ngroups
begin
raiserror('Cannot match - not enough data', 16, 1)
return
end
-- find the minimum number of matches that will reach targetgroupsize
declare @maxmatches int = -1
declare @matchcount int
declare @matchsum int = 0
declare maxmatchcount cursor for select CAST(ratio as int) from #data order by ratio asc
open maxmatchcount
fetch next from maxmatchcount into @matchcount
while @@FETCH_STATUS = 0 and @matchsum <= @targetgroupsize
begin
set @maxmatches = @maxmatches + 1
set @matchsum = @matchsum + @matchcount
fetch next from maxmatchcount into @matchcount
end
close maxmatchcount
deallocate maxmatchcount
exec #match @targetgroupsize, @maxmatches, null, null
declare @output_group_pairs table (groupid int, pairid int)
insert @output_group_pairs select pairid, id from #pairsets where pairid <= @ngroups
-- output groups and their stats
select GroupId, Id from @output_group_pairs a join #data_pairs b on a.pairid = b.pairid order by 1, 2
select a.GroupId, AVG(c.ratio) as [Average] , count(*) as [Count]
from @output_group_pairs a
join #data_pairs b on a.pairid = b.pairid
join t_stats c on b.id = c.id
group by a.groupid
go
drop table #data
drop table #data_pairs
drop table #pairsets
drop table #pairseed
drop proc #match