我有一个包含样本人口数据的表。
这是带有架构和示例数据的SQLFiddle会话。 http://www.sqlfiddle.com/#!6/7e520/1
我想使用TSQL查询来获取具有以下条件的数据:
所选样本人口应包含80%的男性和20%的女性。
所选样本应按城市(City)均匀分布。这是为了确保每个城市都尽可能达到80-20%的男女比例,并且整个样本不会集中在一两个大城市。与各城市内的80-20%相比,总体上的80-20%是首要要求,因为可能存在无法满足该比例的城市。
所选样本应只包含年龄 > 20 且工资 > 10000 的个人。
我无法弄清楚如何遵守基于位置的80-20%分布。
答案 0 :(得分:1)
请逐个CTE地运行下面的查询,并检查中间结果,以了解其工作原理。
CTE_Male 返回应用了过滤条件 Age > 20 and Salary > 10000 之后的所有 Male 行。rnLocation 和 CountLocation 用于以特定方式对这些行进行排序:我希望先从每个城市各取一行,然后再从每个城市各取一行,依此类推,并且行数最多的城市排在最前面。像这样:ABCABCABCABABABAAA。然后,我从这个序列的开头取 N% 的行。
要获得所有男性行的80%,我使用 NTILE(5) 函数:80%相当于5组中的4组,因此主 SELECT 中有一个过滤条件 tile <= 4。如果需要选择其他百分比,请调整 NTILE 的参数。
女性行使用相同的方法,但由于只需要20%,过滤条件为 tile <= 1。
示例数据
-- Sample table used by the query below.
-- Every column, including the numeric-looking Age and Salary, is a nullable varchar(50).
CREATE TABLE dbo.Random_Data
(
    ID           varchar(50) NULL,
    Type         varchar(50) NULL,
    LoDallastion varchar(50) NULL,
    Age          varchar(50) NULL,
    Salary       varchar(50) NULL
)
ON [PRIMARY];
-- Sample rows: one person per row (ID, gender, city, age, salary), all passed as text.
INSERT INTO dbo.Random_Data (ID, Type, LoDallastion, Age, Salary)
VALUES
    (N'1111', N'Male', N'Dallas', N'54', N'85421'),
    (N'1112', N'Male', N'Dallas', N'18', N'164638'),
    (N'1113', N'Male', N'Houston', N'35', N'155336'),
    (N'1114', N'Male', N'Houston', N'24', N'50542'),
    (N'1115', N'Male', N'Houston', N'26', N'155499'),
    (N'1116', N'Male', N'Durham', N'24', N'31165'),
    (N'1117', N'Male', N'Durham', N'15', N'90988'),
    (N'1118', N'Male', N'Durham', N'39', N'150027'),
    (N'1119', N'Female', N'Dallas', N'18', N'159713'),
    (N'1120', N'Female', N'Dallas', N'23', N'62503'),
    (N'1121', N'Female', N'Dallas', N'25', N'177185'),
    (N'1122', N'Female', N'Dallas', N'15', N'193371'),
    (N'1123', N'Female', N'Houston', N'74', N'31370'),
    (N'1124', N'Female', N'Durham', N'46', N'97234'),
    (N'1125', N'Female', N'Durham', N'53', N'182176'),
    (N'1126', N'Female', N'Durham', N'28', N'148712'),
    (N'1127', N'Female', N'Durham', N'42', N'176502'),
    (N'1128', N'Female', N'Durham', N'64', N'62223'),
    (N'1129', N'Female', N'Durham', N'75', N'189944'),
    (N'1130', N'Male', N'San Jose', N'35', N'133312'),
    (N'1131', N'Male', N'San Jose', N'63', N'95123'),
    (N'1132', N'Male', N'San Jose', N'59', N'128996'),
    (N'1133', N'Male', N'San Jose', N'34', N'92812'),
    (N'1134', N'Male', N'San Jose', N'45', N'71514'),
    (N'1135', N'Male', N'San Jose', N'55', N'59455'),
    (N'1136', N'Female', N'San Jose', N'15', N'144925'),
    (N'1137', N'Female', N'San Jose', N'39', N'96778'),
    (N'1138', N'Female', N'San Jose', N'37', N'84800'),
    (N'1139', N'Male', N'San Jose', N'71', N'188530'),
    (N'1140', N'Male', N'San Jose', N'52', N'100521'),
    (N'1141', N'Male', N'San Jose', N'38', N'78682'),
    (N'1142', N'Male', N'San Jose', N'35', N'105433'),
    (N'1143', N'Male', N'San Jose', N'37', N'37529'),
    (N'1144', N'Male', N'San Jose', N'41', N'107591'),
    (N'1145', N'Female', N'San Jose', N'75', N'94867'),
    (N'1146', N'Female', N'San Jose', N'54', N'129194'),
    (N'1147', N'Female', N'San Jose', N'65', N'64206'),
    (N'1148', N'Male', N'San Jose', N'40', N'197250'),
    (N'1149', N'Male', N'San Jose', N'20', N'131461'),
    (N'1150', N'Male', N'San Jose', N'30', N'175853'),
    (N'1151', N'Male', N'San Jose', N'27', N'171956'),
    (N'1152', N'Male', N'San Jose', N'61', N'193986'),
    (N'1153', N'Male', N'San Jose', N'25', N'107503'),
    (N'1154', N'Female', N'Chica', N'28', N'57200');
**查询**
-- Returns roughly 80% of the qualifying Male rows plus roughly 20% of the
-- qualifying Female rows, taking rows round-robin across cities so the sample
-- is not dominated by the largest city.
--
-- Age and Salary are varchar columns; comparing them with integer literals
-- makes SQL Server convert the column values to int implicitly.
WITH
-- Qualifying males. rnLocation numbers the rows inside each city and
-- CountLocation is the number of qualifying males in that city.
CTE_Male
AS
(
    SELECT
        [ID]
        ,[Type]
        ,[LoDallastion]
        ,[Age]
        ,[Salary]
        ,ROW_NUMBER() OVER (PARTITION BY [LoDallastion] ORDER BY [ID]) AS rnLocation
        ,COUNT(*) OVER (PARTITION BY [LoDallastion]) AS CountLocation
    FROM
        [dbo].[Random_Data]
    WHERE
        [Type] = 'Male'
        AND [Age] > 20
        AND [Salary] > 10000
)
-- Males split into 5 equal-sized tiles (20% each). Ordering by rnLocation
-- first interleaves the cities (first row of every city, then second row of
-- every city, ...), largest city first. [LoDallastion] and [ID] are
-- tiebreakers: without them cities with equal counts tie and the tile
-- assignment is not deterministic.
,CTE_MaleTiles
AS
(
    SELECT
        [ID]
        ,[Type]
        ,[LoDallastion]
        ,[Age]
        ,[Salary]
        ,CountLocation
        ,NTILE(5) OVER (ORDER BY rnLocation, CountLocation DESC, [LoDallastion], [ID]) AS tile
    FROM CTE_Male
)
-- Qualifying females, numbered the same way as the males.
,CTE_Female
AS
(
    SELECT
        [ID]
        ,[Type]
        ,[LoDallastion]
        ,[Age]
        ,[Salary]
        ,ROW_NUMBER() OVER (PARTITION BY [LoDallastion] ORDER BY [ID]) AS rnLocation
        ,COUNT(*) OVER (PARTITION BY [LoDallastion]) AS CountLocation
    FROM
        [dbo].[Random_Data]
    WHERE
        [Type] = 'Female'
        AND [Age] > 20
        AND [Salary] > 10000
)
-- Females split into 5 tiles with the same deterministic ordering.
,CTE_FemaleTiles
AS
(
    SELECT
        [ID]
        ,[Type]
        ,[LoDallastion]
        ,[Age]
        ,[Salary]
        ,CountLocation
        ,NTILE(5) OVER (ORDER BY rnLocation, CountLocation DESC, [LoDallastion], [ID]) AS tile
    FROM CTE_Female
)
-- 4 of 5 tiles = 80% of the males.
SELECT
    [ID]
    ,[Type]
    ,[LoDallastion]
    ,[Age]
    ,[Salary]
    ,CountLocation
FROM CTE_MaleTiles
WHERE tile <= 4
UNION ALL
-- 1 of 5 tiles = 20% of the females.
SELECT
    [ID]
    ,[Type]
    ,[LoDallastion]
    ,[Age]
    ,[Salary]
    ,CountLocation
FROM CTE_FemaleTiles
WHERE tile <= 1
ORDER BY [Type], CountLocation DESC, [LoDallastion];
**结果**
+------+--------+--------------+-----+--------+---------------+
| ID | Type | LoDallastion | Age | Salary | CountLocation |
+------+--------+--------------+-----+--------+---------------+
| 1124 | Female | Durham | 46 | 97234 | 6 |
| 1137 | Female | San Jose | 39 | 96778 | 5 |
| 1120 | Female | Dallas | 23 | 62503 | 2 |
| 1130 | Male | San Jose | 35 | 133312 | 17 |
| 1131 | Male | San Jose | 63 | 95123 | 17 |
| 1132 | Male | San Jose | 59 | 128996 | 17 |
| 1133 | Male | San Jose | 34 | 92812 | 17 |
| 1134 | Male | San Jose | 45 | 71514 | 17 |
| 1135 | Male | San Jose | 55 | 59455 | 17 |
| 1139 | Male | San Jose | 71 | 188530 | 17 |
| 1140 | Male | San Jose | 52 | 100521 | 17 |
| 1141 | Male | San Jose | 38 | 78682 | 17 |
| 1142 | Male | San Jose | 35 | 105433 | 17 |
| 1143 | Male | San Jose | 37 | 37529 | 17 |
| 1144 | Male | San Jose | 41 | 107591 | 17 |
| 1148 | Male | San Jose | 40 | 197250 | 17 |
| 1115 | Male | Houston | 26 | 155499 | 3 |
| 1114 | Male | Houston | 24 | 50542 | 3 |
| 1113 | Male | Houston | 35 | 155336 | 3 |
| 1116 | Male | Durham | 24 | 31165 | 2 |
| 1118 | Male | Durham | 39 | 150027 | 2 |
| 1111 | Male | Dallas | 54 | 85421 | 1 |
+------+--------+--------------+-----+--------+---------------+
满足 Age > 20 和 Salary > 10000 的女性行共有15行。15的20%是3,最终结果中有3个女性行。
满足 Age > 20 和 Salary > 10000 的男性行共有23行。23的80%是18.4。NTILE(5) 把这23行分成大小为5、5、5、4、4的五组,前四组共19行,因此最终结果中有19个男性行。
最终结果包含尽可能多的城市的行。
第二个例子
-- Second example: ten cities (A-J), each with exactly one male and one female.
-- NOTE(review): the result shown for this example appears to assume the table
-- holds only these rows — confirm the earlier sample rows are removed first.
INSERT INTO dbo.Random_Data (ID, Type, LoDallastion, Age, Salary)
VALUES
    (N'1111', N'Male', N'A', N'54', N'85421')
    ,(N'1112', N'Male', N'B', N'54', N'85421')
    ,(N'1113', N'Male', N'C', N'54', N'85421')
    ,(N'1114', N'Male', N'D', N'54', N'85421')
    ,(N'1115', N'Male', N'E', N'54', N'85421')
    ,(N'1116', N'Male', N'F', N'54', N'85421')
    ,(N'1117', N'Male', N'G', N'54', N'85421')
    ,(N'1118', N'Male', N'H', N'54', N'85421')
    ,(N'1119', N'Male', N'I', N'54', N'85421')
    ,(N'1120', N'Male', N'J', N'54', N'85421')
    ,(N'2111', N'Female', N'A', N'54', N'85421')
    ,(N'2112', N'Female', N'B', N'54', N'85421')
    ,(N'2113', N'Female', N'C', N'54', N'85421')
    ,(N'2114', N'Female', N'D', N'54', N'85421')
    ,(N'2115', N'Female', N'E', N'54', N'85421')
    ,(N'2116', N'Female', N'F', N'54', N'85421')
    ,(N'2117', N'Female', N'G', N'54', N'85421')
    ,(N'2118', N'Female', N'H', N'54', N'85421')
    ,(N'2119', N'Female', N'I', N'54', N'85421')
    ,(N'2120', N'Female', N'J', N'54', N'85421');
共有10个城市,每个城市有一名男性和一名女性。显然,对于任何单个城市,都不可能得到80-20的比例,但上面查询的总体结果是80-20(8名男性、2名女性)。
**结果**
+------+--------+--------------+-----+--------+---------------+
| ID | Type | LoDallastion | Age | Salary | CountLocation |
+------+--------+--------------+-----+--------+---------------+
| 2111 | Female | A | 54 | 85421 | 1 |
| 2112 | Female | B | 54 | 85421 | 1 |
| 1111 | Male | A | 54 | 85421 | 1 |
| 1112 | Male | B | 54 | 85421 | 1 |
| 1113 | Male | C | 54 | 85421 | 1 |
| 1114 | Male | D | 54 | 85421 | 1 |
| 1115 | Male | E | 54 | 85421 | 1 |
| 1116 | Male | F | 54 | 85421 | 1 |
| 1117 | Male | G | 54 | 85421 | 1 |
| 1118 | Male | H | 54 | 85421 | 1 |
+------+--------+--------------+-----+--------+---------------+
答案 1 :(得分:0)
我使用AVG来获得均匀分布,但你也可以结合标准差(STDEV)和AVG来改善分布:
-- Temporary table holding the sample population (one row per person).
create table #Source
(
    ID       int,
    Type     char(6),
    Location varchar(50),
    Age      tinyint,
    Salary   int
)
go
-- Sample rows for #Source. The explicit column list keeps the insert correct
-- even if the table's column order changes; a single VALUES row constructor
-- replaces the chain of UNION ALL selects.
insert into #Source (ID, Type, Location, Age, Salary)
values
    ( 1, 'Male',   'Dallas',  51, 11000),
    ( 2, 'Male',   'Dallas',  42,  8000),
    ( 3, 'Male',   'Dallas',  52, 20000),
    ( 4, 'Male',   'Houston', 34, 11000),
    ( 5, 'Male',   'Houston', 18,  9000),
    ( 6, 'Male',   'Houston', 32, 15000),
    ( 7, 'Male',   'Houston', 41, 22000),
    ( 8, 'Male',   'Houston', 60, 11000),
    ( 9, 'Male',   'Durham',  55,  8000),
    (10, 'Male',   'Durham',  19, 20000),
    (11, 'Male',   'Durham',  20, 11000),
    (12, 'Male',   'Durham',  51,  9000),
    (13, 'Female', 'Dallas',  42, 15000),
    (14, 'Female', 'Dallas',  52, 22000),
    (15, 'Female', 'Dallas',  34, 11000),
    (16, 'Female', 'Houston', 18,  8000),
    (17, 'Female', 'Houston', 32, 20000),
    (18, 'Female', 'Houston', 41, 11000),
    (19, 'Female', 'Houston', 60,  9000),
    (20, 'Female', 'Houston', 55, 15000),
    (21, 'Female', 'Durham',  19, 22000),
    (22, 'Female', 'Durham',  20, 11000),
    (23, 'Female', 'Durham',  55,  8000);
-- Stage the qualifying rows (Age > 20 and Salary > 10000) in #dest together
-- with the helper columns used by the final filter.
-- Both row numbers are ordered by ID so the numbering, and therefore the final
-- sample, is the same on every run. Ordering RowNum_2 by its own partition
-- columns would leave the numbering arbitrary.
-- No ORDER BY here: it does not control the row order of a table created by
-- SELECT ... INTO.
select
    ID,
    Type,
    Location,
    Age,
    Salary,
    row_number() over (partition by Location order by ID) as RowNum_1,        -- position within the location; used to cap rows per location
    row_number() over (partition by Location, Type order by ID) as RowNum_2,  -- position within location + gender; used for the 80% / 20% caps
    count(*) over (partition by Location) * 0.8 as [80%],                     -- 80% of the qualifying rows in this location
    count(*) over (partition by Location) * 0.2 as [20%]                      -- 20% of the qualifying rows in this location
into #dest
from #Source
where Age > 20
  and Salary > 10000;
-- Average number of staged rows per location (integer division truncates).
-- NULLIF prevents a divide-by-zero error when #dest is empty; the variable is
-- then NULL and the filter below returns no rows.
declare @AvgRowPerLocation int;

select @AvgRowPerLocation = count(*) / nullif(count(distinct Location), 0)
from #dest;

-- Final sample: at most the average number of rows from each location, and
-- within a location at most 80% males and 20% females.
-- The RowNum_1 condition can be removed when the data set is small.
select
    ID,
    Type,
    Location,
    Age,
    Salary,
    RowNum_1,
    RowNum_2,
    [80%],
    [20%]
from #dest
where RowNum_1 <= @AvgRowPerLocation
  and (   (Type = 'Male'   and RowNum_2 <= [80%])
       or (Type = 'Female' and RowNum_2 <= [20%]))
order by Location, Type, ID;