在 T-SQL 中计算移动中位数和移动众数

时间:2015-04-09 15:31:50

标签: sql-server tsql window-functions

我正在使用 SQL Server 2012,我知道计算移动平均值非常简单。但我需要的是针对指定的窗口框架(当前行加上它之前的 2 行;月份唯一)计算众数和中位数:

MONTH | CODE | MEDIAN | MODE
   1      0        0      0
   2      3        1.5    0
   3      2        2      0
   4      2        2      2
   5      2        2      2
   6      5        2      2
   7      3        3      2

如果有多个值并列为众数,则选择最先出现的那个。

2 个答案:

答案 0 :(得分:2)

我在代码里写了详细的注释。请阅读我对众数计算的注释,如果需要调整请告诉我。总的来说,这是一个相对简单的查询,只是包含了不少难看的子查询和大量注释。请看:

-- Demo data: one row per month; [Code] is the value being summarised.
DECLARE @Table TABLE ([Month] INT,[Code] INT);
INSERT INTO @Table ([Month],[Code])
    VALUES  (1,0),
            (2,3),
            (3,2),
            (4,2),
            (5,2),
            (6,5),
            (7,3);

-- Number of rows before the current row that belong to the moving window.
DECLARE @Preceding INT = 2;

WITH CTE
AS
(
-- Number the rows by month so the window is expressed in row positions,
-- and cast Code to FLOAT so the median of two values can be fractional.
SELECT  ROW_NUMBER() OVER (ORDER BY [Month]) AS row_num,
        [Month],
        CAST([Code] AS FLOAT) AS Code
FROM @Table
)

SELECT  A.[Month],
        A.Code,
        Med.Median,
        Mo.Mode
FROM CTE AS A
-- Median: sort the window's codes, then average the one or two middle rows.
-- With integer division, (cnt + 1) / 2 and (cnt + 2) / 2 are the same position
-- for an odd count and the two central positions for an even count, so a
-- one-row window yields that row's own code instead of falling back to 0.
CROSS APPLY (
        SELECT AVG(W.Code) AS Median
        FROM (
            SELECT  B.Code,
                    ROW_NUMBER() OVER (ORDER BY B.Code) AS pos,
                    COUNT(*) OVER () AS cnt
            FROM CTE AS B
            WHERE B.row_num BETWEEN A.row_num - @Preceding AND A.row_num
              AND B.Code IS NOT NULL    -- NULLs would otherwise sort first and shift the middle
            ) AS W
        WHERE W.pos IN ((W.cnt + 1) / 2, (W.cnt + 2) / 2)
        ) AS Med
-- Mode: most frequent code inside the same window. Ties (including windows
-- where every code occurs once) are resolved in favour of the code that
-- appears first in the window, i.e. the one with the smallest row_num.
-- OUTER APPLY keeps the month even if the window holds no non-NULL code.
OUTER APPLY (
        SELECT TOP (1) C.Code AS Mode
        FROM CTE AS C
        WHERE C.row_num BETWEEN A.row_num - @Preceding AND A.row_num
          AND C.Code IS NOT NULL
        GROUP BY C.Code
        ORDER BY COUNT(*) DESC, MIN(C.row_num) ASC
        ) AS Mo
-- Explicit ordering: without it the row order of the result is not guaranteed.
ORDER BY A.[Month];

结果:

Month       Code                   Median                 Mode
----------- ---------------------- ---------------------- ----------------------
1           0                      0                      0
2           3                      1.5                    0
3           2                      2                      0
4           2                      2                      2
5           2                      2                      2
6           5                      2                      2
7           3                      3                      2

答案 1 :(得分:0)

如果我正确理解了您的要求,您的源表包含MONTH和CODE列,并且您想要计算MEDIAN和MODE。

下面的查询使用不超过 3 个月的移动窗口(当前行加上“当前行之前的 2 行”)计算 MEDIAN 和 MODE,并返回与您的示例匹配的结果。

-----------------------------------------------------
--Demo data
-----------------------------------------------------

-- Drop a leftover copy so the script can be re-run in the same session
-- (SQL Server 2012 has no DROP TABLE IF EXISTS).
IF OBJECT_ID(N'tempdb..#Data') IS NOT NULL
    DROP TABLE #Data;

-- One row per month. The primary key is deliberately left unnamed: constraint
-- names on temp tables are not made unique per session, so a fixed name such
-- as [PK_Data] makes a second session's CREATE TABLE fail with a name clash.
CREATE TABLE #Data(
    [Month] INT NOT NULL,
    [Code] INT NOT NULL,
PRIMARY KEY CLUSTERED 
(
    [Month] ASC
));

INSERT INTO #Data
([Month],[Code])
VALUES
(1,0),
(2,3),
(3,2),
(4,2),
(5,2),
(6,5),
(7,3);

-----------------------------------------------------
--Query
-----------------------------------------------------

-- Number of rows before the current row that take part in each moving window.
DECLARE @PrecedingRowsLimit INT = 2;

WITH [Numbered] AS
(
    -- Sequential position of every month. The window is defined on these
    -- positions, so it stays "current row + N preceding rows" even when the
    -- month values do not start at 1 or contain gaps.
    SELECT [Month] 
        , [Code] 
        , ROW_NUMBER() OVER(ORDER BY [Month]) AS [RowNumber]
    FROM #Data 
)
, [MPos] AS
(
    -- One row per (window owner month, member row of its window), with the
    -- member's rank by Code inside the window and the window's size.
    SELECT [R].[Month] 
        , [RB].[Month] AS [SubId]
        , [RB].[Code] 
        , ROW_NUMBER() OVER(PARTITION BY [R].[Month] ORDER BY [RB].[Code]) AS [RowNumberInPartition]
        , COUNT(*) OVER(PARTITION BY [R].[Month]) AS [WindowCount]
    FROM [Numbered] [R]
        INNER JOIN [Numbered] [RB]
        ON [RB].[RowNumber] BETWEEN [R].[RowNumber] - @PrecedingRowsLimit AND [R].[RowNumber]
)
, [MedianByMonth] AS
(
    -- Average of the middle row(s): with integer division the two positions
    -- coincide for an odd window size and are the two central rows for an
    -- even one. Multiplying by 1.0 keeps the average from being truncated.
    SELECT [Month] 
        , AVG([Code] * 1.0) AS [Median]
    FROM [MPos] 
    WHERE [RowNumberInPartition] IN (([WindowCount] + 1) / 2, ([WindowCount] + 2) / 2)
    GROUP BY [Month] 
)
, [ModeByMonth] AS
(
    -- Most frequent Code per window; ties go to the code whose first
    -- occurrence in the window is earliest (smallest SubId, i.e. month).
    SELECT [MG].[Month] 
        , [MG].[Code] AS [Mode]
    FROM
    (
        SELECT [Month] 
            , [Code] 
            , ROW_NUMBER() OVER(PARTITION BY [Month] ORDER BY COUNT(*) DESC , MIN([SubId]) ASC) AS [ModeRank]
        FROM [MPos] 
        GROUP BY [Month] , [Code] 
    ) [MG]
    WHERE [MG].[ModeRank] = 1
)
-- Each helper CTE yields exactly one row per month, so no DISTINCT is needed.
SELECT [ORIG].[Month] 
    , [ORIG].[Code] 
    , [ME].[Median] 
    , [MOD].[Mode] 
FROM #Data [ORIG]
    INNER JOIN [MedianByMonth] [ME]
    ON [ME].[Month] = [ORIG].[Month] 
    INNER JOIN [ModeByMonth] [MOD]
    ON [MOD].[Month] = [ORIG].[Month] 
ORDER BY [ORIG].[Month];