TSQL根据位置选择80%男性和20%女性人口

时间:2017-08-28 04:22:14

标签: sql sql-server tsql

我有一个包含样本人口数据的表。

这是带有架构和示例数据的SQLFiddle会话。 http://www.sqlfiddle.com/#!6/7e520/1

我想使用TSQL查询来获取具有以下条件的数据:

  1. 选定的样本人口应包含80%的男性和20%的女性。

  2. 所选样本应按城市(City)均匀分布。这是为了确保所有城市都具有80-20%的男女比例(如果可能),并且整个样本不局限于一两个大城市。与每个城市内部的80-20%相比,总体上的80-20%是主要要求,因为可能存在无法满足该比例的城市。

  3. 所选样本应只包含年龄 > 20 且工资 > 10000 的个人。

  4. 我无法弄清楚如何遵守基于位置的80-20%分布。

2 个答案:

答案 0 :(得分:1)

请逐个CTE地运行下面的查询,并检查中间结果,以了解其工作原理。

CTE_Male返回已应用过滤器Age > 20 and Salary > 10000的所有Male行的列表。rnLocation和CountLocation将用于以特定方式对这些行进行排序。我希望先把每个城市各列出一行,然后再把每个城市各列出一行,依此类推,其中行数最多的城市排在最前面。像这样:ABCABCABCABABABAAA。然后,我取这些行中的前N%。

要获得所有男性行的80%,我使用NTILE(5)函数。80%是5组中的4组。因此,主SELECT有一个过滤器tile <= 4。如果您需要选择不同的百分比,请调整NTILE的参数。

相同的方法适用于女性行,但由于我们需要20%,因此过滤器为tile <= 1

示例数据

-- Sample population table used by the query below.
-- Every column, including the numeric-looking Age and Salary, is declared
-- as a nullable varchar(50).
CREATE TABLE [dbo].[Random_Data]
(
    [ID]           varchar(50) NULL,
    [Type]         varchar(50) NULL,
    [LoDallastion] varchar(50) NULL,
    [Age]          varchar(50) NULL,
    [Salary]       varchar(50) NULL
)
ON [PRIMARY];

-- First sample data set: 44 people (IDs 1111-1154) spread over Dallas,
-- Houston, Durham, San Jose and Chica. All values, including Age and Salary,
-- are supplied as N'...' string literals.
INSERT [dbo].[Random_Data] ([ID], [Type], [LoDallastion], [Age], [Salary]) VALUES 
 (N'1111', N'Male', N'Dallas', N'54', N'85421')
,(N'1112', N'Male', N'Dallas', N'18', N'164638')
,(N'1113', N'Male', N'Houston', N'35', N'155336')
,(N'1114', N'Male', N'Houston', N'24', N'50542')
,(N'1115', N'Male', N'Houston', N'26', N'155499')
,(N'1116', N'Male', N'Durham', N'24', N'31165')
,(N'1117', N'Male', N'Durham', N'15', N'90988')
,(N'1118', N'Male', N'Durham', N'39', N'150027')
,(N'1119', N'Female', N'Dallas', N'18', N'159713')
,(N'1120', N'Female', N'Dallas', N'23', N'62503')
,(N'1121', N'Female', N'Dallas', N'25', N'177185')
,(N'1122', N'Female', N'Dallas', N'15', N'193371')
,(N'1123', N'Female', N'Houston', N'74', N'31370')
,(N'1124', N'Female', N'Durham', N'46', N'97234')
,(N'1125', N'Female', N'Durham', N'53', N'182176')
,(N'1126', N'Female', N'Durham', N'28', N'148712')
,(N'1127', N'Female', N'Durham', N'42', N'176502')
,(N'1128', N'Female', N'Durham', N'64', N'62223')
,(N'1129', N'Female', N'Durham', N'75', N'189944')
,(N'1130', N'Male', N'San Jose', N'35', N'133312')
,(N'1131', N'Male', N'San Jose', N'63', N'95123')
,(N'1132', N'Male', N'San Jose', N'59', N'128996')
,(N'1133', N'Male', N'San Jose', N'34', N'92812')
,(N'1134', N'Male', N'San Jose', N'45', N'71514')
,(N'1135', N'Male', N'San Jose', N'55', N'59455')
,(N'1136', N'Female', N'San Jose', N'15', N'144925')
,(N'1137', N'Female', N'San Jose', N'39', N'96778')
,(N'1138', N'Female', N'San Jose', N'37', N'84800')
,(N'1139', N'Male', N'San Jose', N'71', N'188530')
,(N'1140', N'Male', N'San Jose', N'52', N'100521')
,(N'1141', N'Male', N'San Jose', N'38', N'78682')
,(N'1142', N'Male', N'San Jose', N'35', N'105433')
,(N'1143', N'Male', N'San Jose', N'37', N'37529')
,(N'1144', N'Male', N'San Jose', N'41', N'107591')
,(N'1145', N'Female', N'San Jose', N'75', N'94867')
,(N'1146', N'Female', N'San Jose', N'54', N'129194')
,(N'1147', N'Female', N'San Jose', N'65', N'64206')
,(N'1148', N'Male', N'San Jose', N'40', N'197250')
,(N'1149', N'Male', N'San Jose', N'20', N'131461')
,(N'1150', N'Male', N'San Jose', N'30', N'175853')
,(N'1151', N'Male', N'San Jose', N'27', N'171956')
,(N'1152', N'Male', N'San Jose', N'61', N'193986')
,(N'1153', N'Male', N'San Jose', N'25', N'107503')
,(N'1154', N'Female', N'Chica', N'28', N'57200');

**查询**

-- Selects roughly 80% of the qualifying Male rows and 20% of the qualifying
-- Female rows, spreading the picks over as many cities as possible.
--
-- Approach, applied separately per gender:
--   1. Number the rows inside each city (rnLocation) and count the rows of
--      each city (CountLocation).
--   2. Order all rows by rnLocation first, so the sequence takes one row from
--      every city, then a second row from every city, and so on; cities with
--      more rows come first within each round.
--   3. Split that sequence into 5 equal tiles with NTILE(5) and keep the
--      first 4 tiles (80%) for males or the first tile (20%) for females.
--
-- The NTILE ordering ends with [LoDallastion] as a tie-breaker. Without it,
-- two cities with the same rnLocation and CountLocation could be ordered
-- arbitrarily, so the set of selected rows could differ between executions.
WITH
CTE_Male
AS
(
    -- Qualifying male rows, numbered within their city.
    SELECT
        [ID]
        ,[Type]
        ,[LoDallastion]
        ,[Age]
        ,[Salary]
        ,ROW_NUMBER() OVER (PARTITION BY [LoDallastion] ORDER BY [ID]) AS rnLocation
        ,COUNT(*) OVER (PARTITION BY [LoDallastion]) AS CountLocation
    FROM
        [dbo].[Random_Data]
    WHERE
        [Type] = 'Male'
        AND [Age] > 20
        AND [Salary] > 10000
)
,CTE_MaleTiles
AS
(
    -- Round-robin ordering over cities, cut into 5 tiles of (almost) equal size.
    SELECT
        [ID]
        ,[Type]
        ,[LoDallastion]
        ,[Age]
        ,[Salary]
        ,CountLocation
        ,NTILE(5) OVER (ORDER BY rnLocation, CountLocation DESC, [LoDallastion]) AS tile
    FROM CTE_Male
)
,CTE_Female
AS
(
    -- Qualifying female rows, numbered within their city.
    SELECT
        [ID]
        ,[Type]
        ,[LoDallastion]
        ,[Age]
        ,[Salary]
        ,ROW_NUMBER() OVER (PARTITION BY [LoDallastion] ORDER BY [ID]) AS rnLocation
        ,COUNT(*) OVER (PARTITION BY [LoDallastion]) AS CountLocation
    FROM
        [dbo].[Random_Data]
    WHERE
        [Type] = 'Female'
        AND [Age] > 20
        AND [Salary] > 10000
)
,CTE_FemaleTiles
AS
(
    -- Same round-robin ordering and tiling as for the male rows.
    SELECT
        [ID]
        ,[Type]
        ,[LoDallastion]
        ,[Age]
        ,[Salary]
        ,CountLocation
        ,NTILE(5) OVER (ORDER BY rnLocation, CountLocation DESC, [LoDallastion]) AS tile
    FROM CTE_Female
)
-- 4 of 5 tiles = 80% of the male rows.
SELECT 
    [ID]
    ,[Type]
    ,[LoDallastion]
    ,[Age]
    ,[Salary]
    ,CountLocation
FROM CTE_MaleTiles
WHERE tile <= 4

UNION ALL

-- 1 of 5 tiles = 20% of the female rows.
SELECT 
    [ID]
    ,[Type]
    ,[LoDallastion]
    ,[Age]
    ,[Salary]
    ,CountLocation
FROM CTE_FemaleTiles
WHERE tile <= 1

ORDER BY [Type], CountLocation DESC, [LoDallastion];

**结果**

+------+--------+--------------+-----+--------+---------------+
|  ID  |  Type  | LoDallastion | Age | Salary | CountLocation |
+------+--------+--------------+-----+--------+---------------+
| 1124 | Female | Durham       |  46 |  97234 |             6 |
| 1137 | Female | San Jose     |  39 |  96778 |             5 |
| 1120 | Female | Dallas       |  23 |  62503 |             2 |
| 1130 | Male   | San Jose     |  35 | 133312 |            17 |
| 1131 | Male   | San Jose     |  63 |  95123 |            17 |
| 1132 | Male   | San Jose     |  59 | 128996 |            17 |
| 1133 | Male   | San Jose     |  34 |  92812 |            17 |
| 1134 | Male   | San Jose     |  45 |  71514 |            17 |
| 1135 | Male   | San Jose     |  55 |  59455 |            17 |
| 1139 | Male   | San Jose     |  71 | 188530 |            17 |
| 1140 | Male   | San Jose     |  52 | 100521 |            17 |
| 1141 | Male   | San Jose     |  38 |  78682 |            17 |
| 1142 | Male   | San Jose     |  35 | 105433 |            17 |
| 1143 | Male   | San Jose     |  37 |  37529 |            17 |
| 1144 | Male   | San Jose     |  41 | 107591 |            17 |
| 1148 | Male   | San Jose     |  40 | 197250 |            17 |
| 1115 | Male   | Houston      |  26 | 155499 |             3 |
| 1114 | Male   | Houston      |  24 |  50542 |             3 |
| 1113 | Male   | Houston      |  35 | 155336 |             3 |
| 1116 | Male   | Durham       |  24 |  31165 |             2 |
| 1118 | Male   | Durham       |  39 | 150027 |             2 |
| 1111 | Male   | Dallas       |  54 |  85421 |             1 |
+------+--------+--------------+-----+--------+---------------+

满足Age > 20且Salary > 10000的女性行有15个。15的20%是3。最终结果有3个女性行。

满足Age > 20且Salary > 10000的男性行有23个。23的80%是18.4。最终结果有19个男性行。

最终结果包含尽可能多的城市的行。

第二个例子

-- Second sample data set: ten cities (A-J), each with exactly one Male row
-- (IDs 1111-1120) and one Female row (IDs 2111-2120); every row has Age 54
-- and Salary 85421.
INSERT [dbo].[Random_Data] ([ID], [Type], [LoDallastion], [Age], [Salary]) VALUES 
(N'1111', N'Male', N'A', N'54', N'85421'),
(N'1112', N'Male', N'B', N'54', N'85421'),
(N'1113', N'Male', N'C', N'54', N'85421'),
(N'1114', N'Male', N'D', N'54', N'85421'),
(N'1115', N'Male', N'E', N'54', N'85421'),
(N'1116', N'Male', N'F', N'54', N'85421'),
(N'1117', N'Male', N'G', N'54', N'85421'),
(N'1118', N'Male', N'H', N'54', N'85421'),
(N'1119', N'Male', N'I', N'54', N'85421'),
(N'1120', N'Male', N'J', N'54', N'85421'),
(N'2111', N'Female', N'A', N'54', N'85421'),
(N'2112', N'Female', N'B', N'54', N'85421'),
(N'2113', N'Female', N'C', N'54', N'85421'),
(N'2114', N'Female', N'D', N'54', N'85421'),
(N'2115', N'Female', N'E', N'54', N'85421'),
(N'2116', N'Female', N'F', N'54', N'85421'),
(N'2117', N'Female', N'G', N'54', N'85421'),
(N'2118', N'Female', N'H', N'54', N'85421'),
(N'2119', N'Female', N'I', N'54', N'85421'),
(N'2120', N'Female', N'J', N'54', N'85421');

共有10个城市,每个城市有一名男性和一名女性。显然,任何单个城市都不可能实现80-20的划分,但查询的总体结果是80-20的划分。

**结果**

+------+--------+--------------+-----+--------+---------------+
|  ID  |  Type  | LoDallastion | Age | Salary | CountLocation |
+------+--------+--------------+-----+--------+---------------+
| 2111 | Female | A            |  54 |  85421 |             1 |
| 2112 | Female | B            |  54 |  85421 |             1 |
| 1111 | Male   | A            |  54 |  85421 |             1 |
| 1112 | Male   | B            |  54 |  85421 |             1 |
| 1113 | Male   | C            |  54 |  85421 |             1 |
| 1114 | Male   | D            |  54 |  85421 |             1 |
| 1115 | Male   | E            |  54 |  85421 |             1 |
| 1116 | Male   | F            |  54 |  85421 |             1 |
| 1117 | Male   | G            |  54 |  85421 |             1 |
| 1118 | Male   | H            |  54 |  85421 |             1 |
+------+--------+--------------+-----+--------+---------------+

答案 1 :(得分:0)

我使用AVG来获得均匀分布,但你也可以结合使用标准差(STDEV)和AVG来改善分布:

-- Temporary table holding the sample population for this answer.
CREATE TABLE #Source
(
    ID       int,
    Type     char(6),
    Location varchar(50),
    Age      tinyint,
    Salary   int
);
GO

-- Sample rows: 12 males (IDs 1-12) and 11 females (IDs 13-23) spread over
-- Dallas, Houston and Durham. The explicit column list keeps the insert
-- independent of the column order of #Source.
INSERT INTO #Source (ID, Type, Location, Age, Salary)
VALUES
    ( 1, 'Male',   'Dallas',  51, 11000),
    ( 2, 'Male',   'Dallas',  42,  8000),
    ( 3, 'Male',   'Dallas',  52, 20000),
    ( 4, 'Male',   'Houston', 34, 11000),
    ( 5, 'Male',   'Houston', 18,  9000),
    ( 6, 'Male',   'Houston', 32, 15000),
    ( 7, 'Male',   'Houston', 41, 22000),
    ( 8, 'Male',   'Houston', 60, 11000),
    ( 9, 'Male',   'Durham',  55,  8000),
    (10, 'Male',   'Durham',  19, 20000),
    (11, 'Male',   'Durham',  20, 11000),
    (12, 'Male',   'Durham',  51,  9000),
    (13, 'Female', 'Dallas',  42, 15000),
    (14, 'Female', 'Dallas',  52, 22000),
    (15, 'Female', 'Dallas',  34, 11000),
    (16, 'Female', 'Houston', 18,  8000),
    (17, 'Female', 'Houston', 32, 20000),
    (18, 'Female', 'Houston', 41, 11000),
    (19, 'Female', 'Houston', 60,  9000),
    (20, 'Female', 'Houston', 55, 15000),
    (21, 'Female', 'Durham',  19, 22000),
    (22, 'Female', 'Durham',  20, 11000),
    (23, 'Female', 'Durham',  55,  8000);


 -- Builds #dest from the rows of #Source with Age > 20 and Salary > 10000,
 -- adding the helper columns used by the final filter.
 -- RowNum_2 is ordered by ID: ordering by the partition columns themselves
 -- (Location, Type) leaves the numbering inside a partition arbitrary, so
 -- the rows kept by the final filter could change between executions.
 select * ,
 ROW_NUMBER() over (partition by Location order by ID) as RowNum_1 ,        -- used to get a uniform distribution based on location
 ROW_NUMBER() over (partition by Location , Type order by ID) as RowNum_2 , -- used to get 80% males & 20% females
 count(*) over (partition by Location) * 0.8 as [80%] ,                     -- 80% of the qualifying row count of the location
 count(*) over (partition by Location) * 0.2 as [20%]                       -- 20% of the qualifying row count of the location
 into #dest
 from #Source
 where Age > 20 and Salary > 10000
 order by Location , Type


 -- Average number of rows per distinct location in #dest. Both counts are
 -- integers, so the division truncates. NULLIF turns a zero location count
 -- (empty #dest) into NULL, leaving the variable NULL instead of raising a
 -- divide-by-zero error.
 declare @AvgRowPerLocation int
 select @AvgRowPerLocation = COUNT(*) / NULLIF(count(distinct Location), 0) from #dest

 -- Final sample. The RowNum_1 cap keeps the number of rows taken from each
 -- location roughly equal (it can be dropped for small data sets); the
 -- RowNum_2 conditions keep males up to the [80%] value and females up to
 -- the [20%] value of their location.
 SELECT *
 FROM #dest
 WHERE RowNum_1 <= @AvgRowPerLocation
   AND (
          (Type = 'Male'   AND RowNum_2 <= [80%])
       OR (Type = 'Female' AND RowNum_2 <= [20%])
       )