我有一个这样的样本表:
-- Build the sample table: each row is one occurrence of a Name within a Category.
CREATE TABLE #TEMP
(
    Category VARCHAR(100),
    Name     VARCHAR(100)
);

-- The explicit column list keeps the load independent of the table's column order.
-- Category A: John x6, Adam x4, Lisa x2, Bucky x1.
-- Category B: Lily x5, Tom x4, Ross x3.
INSERT INTO #TEMP (Category, Name)
VALUES
    ('A', 'John'),
    ('A', 'John'),
    ('A', 'John'),
    ('A', 'John'),
    ('A', 'John'),
    ('A', 'John'),
    ('A', 'Adam'),
    ('A', 'Adam'),
    ('A', 'Adam'),
    ('A', 'Adam'),
    ('A', 'Lisa'),
    ('A', 'Lisa'),
    ('A', 'Bucky'),
    ('B', 'Lily'),
    ('B', 'Lily'),
    ('B', 'Lily'),
    ('B', 'Lily'),
    ('B', 'Lily'),
    ('B', 'Tom'),
    ('B', 'Tom'),
    ('B', 'Tom'),
    ('B', 'Tom'),
    ('B', 'Ross'),
    ('B', 'Ross'),
    ('B', 'Ross');
-- Count the non-NULL Name values of every (Category, Name) group and list
-- each category's groups from the highest count to the lowest.
SELECT
    Category,
    Name,
    COUNT(Name) AS Total
FROM #TEMP
GROUP BY
    Category,
    Name
ORDER BY
    Category ASC,
    Total DESC;

-- Remove the sample table once the result has been produced.
DROP TABLE #TEMP;
得到以下结果:
A John 6
A Adam 4
A Lisa 2
A Bucky 1
B Lily 5
B Tom 4
B Ross 3
现在,如何从每个类别中选出 TOP 5 PERCENT 的记录?假设每个类别的记录都超过 100 条(示例表中没有全部列出)。例如,在我的实际表中,应该相应地从 A 中移除 John 的记录、从 B 中移除 Lily 的记录(同样,这里没有显示完整的表),得到:
A Adam 4
A Lisa 2
A Bucky 1
B Tom 4
B Ross 3
我一直在尝试使用 CTE 和 PARTITION BY 子句,但始终得不到想要的结果:它是从整体结果中去掉 TOP 5 PERCENT,而不是从每个类别中分别去掉。有什么建议吗?
答案 0 :(得分:16)
您可以将 CTE(公用表表达式)与 NTILE 窗口函数配合使用 - 它会把数据切分成您需要的份数,例如在您的情况下分为 20 份(每份 5%)。
-- Split each category's (Category, Name) groups into 20 buckets ordered by
-- descending occurrence count, then drop the first bucket of every category.
;WITH SlicedData AS
(
    SELECT
        Category,
        Name,
        COUNT(Name) AS Total,
        NTILE(20) OVER (PARTITION BY Category ORDER BY COUNT(Name) DESC) AS [NTile]
    FROM #TEMP
    GROUP BY
        Category,
        Name
)
SELECT
    Category,
    Name,
    Total,
    NTile
FROM SlicedData
-- NTILE numbers its buckets from 1, so ">= 2" skips only the top bucket.
WHERE NTile >= 2
这基本上是先按 Category、Name 对数据进行分组,再按某个值排序(不确定 COUNT(Name) 是否真的是您想要的排序依据),然后将每个分区切成 20 份,每份代表该分区数据的 5%。NTile = 1 的那一份就是前 5% - 从 CTE 中查询时把它排除即可。
请参阅 NTILE 的官方文档了解更多信息。
答案 1 :(得分:1)
编辑:我添加了第二个解决方案
-- Tag every row with the number of rows sharing its (Category, Name) and the
-- number of rows in its Category, then return the rows that pass the filter.
-- NOTE(review): Id is not one of the two columns in the #TEMP definition shown
-- in the question; presumably an extra key column (e.g. an IDENTITY) was added
-- before running this - confirm.
;WITH Counted AS
(
    SELECT
        t.Id,
        t.Category,
        t.Name,
        COUNT(*) OVER (PARTITION BY t.Category, t.Name) AS CategoryNameCount,
        COUNT(*) OVER (PARTITION BY t.Category)         AS CategoryCount
    FROM #TEMP AS t
)
SELECT
    c.Id,
    c.Category,
    c.Name,
    c.CategoryNameCount
FROM Counted AS c
-- Both sides carry the same CategoryCount / 100 factor, so algebraically this
-- keeps the rows whose CategoryNameCount is below 5.
WHERE c.CategoryCount * 5.0 / 100 > c.CategoryCount * c.CategoryNameCount * 1.0 / 100
ORDER BY
    c.Category,
    c.CategoryNameCount DESC,
    c.Name
结果:
Id Category Name CategoryNameCount
----------- -------- ---------- -----------------
7 A Adam 4
8 A Adam 4
9 A Adam 4
10 A Adam 4
11 A Lisa 2
12 A Lisa 2
13 A Bucky 1
19 B Tom 4
20 B Tom 4
21 B Tom 4
22 B Tom 4
23 B Ross 3
24 B Ross 3
25 B Ross 3
或
-- Same per-row counts as a windowed query, collapsed to one output row per
-- (Category, Name, CategoryNameCount) combination.
;WITH Counted AS
(
    SELECT
        t.Category,
        t.Name,
        COUNT(*) OVER (PARTITION BY t.Category, t.Name) AS CategoryNameCount,
        COUNT(*) OVER (PARTITION BY t.Category)         AS CategoryCount
    FROM #TEMP AS t
)
SELECT
    c.Category,
    c.Name,
    c.CategoryNameCount
FROM Counted AS c
-- Both sides carry the same CategoryCount / 100 factor, so algebraically this
-- keeps the rows whose CategoryNameCount is below 5.
WHERE c.CategoryCount * 5.0 / 100 > c.CategoryCount * c.CategoryNameCount * 1.0 / 100
-- The grouping only removes the duplicate rows left by the window counts.
GROUP BY
    c.Category,
    c.Name,
    c.CategoryNameCount
ORDER BY
    c.Category,
    c.CategoryNameCount DESC,
    c.Name
结果:
Category Name CategoryNameCount
-------- ---------- -----------------
A Adam 4
A Lisa 2
A Bucky 1
B Tom 4
B Ross 3
答案 2 :(得分:1)
-- Number the rows inside each (Category, Name) group and keep those whose
-- sequence number does not exceed 50 percent of the group's row count.
-- (50 * CountTotal) / 100 is integer arithmetic, so the limit is rounded down.
SELECT
    Category,
    name,
    CountTotal,
    RankSeq,
    (50 * CountTotal) / 100
FROM
(
    SELECT
        Category,
        name,
        COUNT(*)     OVER (PARTITION BY Category, name)                   AS CountTotal,
        ROW_NUMBER() OVER (PARTITION BY Category, name ORDER BY Category) AS RankSeq
    FROM #TEMP
) AS numbered
WHERE (50 * CountTotal) / 100 >= RankSeq
ORDER BY
    Category,
    Name,
    RankSeq
输出:
Category name CountTotal RankSeq (50*CountTotal)/100
A Adam 4 1 2
A Adam 4 2 2
A John 6 1 3
A John 6 2 3
A John 6 3 3
A Lisa 2 1 1
B Lily 5 1 2
B Lily 5 2 2
B Ross 3 1 1
B Tom 4 1 2
B Tom 4 2 2
我希望这会有所帮助:)
答案 3 :(得分:0)
-- Rank each (Category, Name) group inside its category by descending
-- occurrence count and keep the groups whose percent rank is below 5.
;WITH SlicedData AS
(
    SELECT
        Category,
        Name,
        COUNT(Name) AS Total,
        -- PERCENT_RANK() returns a value between 0 and 1; scale it to 0-100.
        -- PERCENT is a reserved keyword in T-SQL, so the alias is bracketed.
        PERCENT_RANK() OVER (PARTITION BY Category ORDER BY COUNT(Name) DESC) * 100 AS [Percent]
    FROM #TEMP
    GROUP BY
        Category,
        Name
)
SELECT
    Category,
    Name,
    Total,
    [Percent]
FROM SlicedData
WHERE [Percent] < 5
如果记录数少于指定的分片数,NTILE 将无法按预期工作。