SQL Server中的多元线性回归函数

时间:2015-02-01 12:55:30

标签: sql sql-server regression

我已经在SQL Server中开发了简单线性回归函数(https://ask.sqlservercentral.com/questions/96778/can-this-linear-regression-algorithm-for-sql-serve.html),用来计算Alpha、Beta以及一些额外的值,例如Upper 95%和Lower 95%。该简单线性回归函数以X和y作为参数。

现在我需要在SQL Server中执行多元线性回归,它接受参数y和X1,X2,X3,……,Xn。

因此输出将如下:

                    Coefficients     Standard Error  t Stat         P-value     Lower 95%       Upper 95%
+-------------------------------------------------------------------------------------------+
    Intercept       -23.94650812     19.85250194     -1.20622117    0.351059563 -109.3649298    
    X Variable 1    0.201064291      0.119759437     1.678901439    0.235179    -0.314218977    
    X Variable 2    -0.014046021     0.037366638     -0.375897368   0.743119791 -0.174821687
    X Variable 3    0.502074905      0.295848189     1.697069389    0.231776287 -0.770857111
    X Variable 4    0.068238344      0.219256527     0.311226057    0.785072958 -0.875146351

有人能给我建议一个实现这一目标的好方法吗?

3 个答案:

答案 0 :(得分:7)

我会考虑使用CLR integration来利用支持线性回归的现有.NET库,例如Math.NET Numerics。使用CLR存储过程,您将能够从表中读取数据,将其转换为.NET库矩阵类型,运行回归,然后将结果写回表或直接返回行集。

不过纯粹为了好玩,下面是在SQL中使用Householder reflections(Householder反射)、通过Orthogonal Decomposition(正交分解)求解Linear Least Squares(线性最小二乘)的实现。(警告:数据量较大时运行会很慢。)

-- Table type that represents a 2D matrix in coordinate form: one row per cell,
-- identified by (i, j) and carrying the cell value in Aij.
-- The primary key on (i, j) allows at most one value per cell.

CREATE TYPE dbo.Matrix AS TABLE
(
    i   int NOT NULL,   -- first index of the cell; NOT NULL is implied by the primary key
    j   int NOT NULL,   -- second index of the cell; NOT NULL is implied by the primary key
    Aij float,          -- value stored in cell (i, j)
    PRIMARY KEY (i, j)
)
GO

-- Function to perform QR factorisation (A -> QR) using Householder reflections.
--
-- Parameters:
--   @matrix  the matrix A, one row per cell: i = row index, j = column index, Aij = value.
--            m = MAX(i) and n = MAX(j) are taken as its dimensions and indices start at 1.
--            NOTE(review): the UPDATE statements below only rewrite cells that already
--            exist, so A is presumably expected to be dense (every cell present) - confirm.
--
-- Returns one row per cell of both factors:
--   matrix = 'Q'  the m x m factor accumulated from the reflections
--   matrix = 'R'  the transformed copy of A (entries below the diagonal are eliminated,
--                 up to floating point round-off)
--
-- Notes:
--   * A QR factorisation is only unique up to the sign of each reflection. This version
--     always picks the numerically stable sign, so individual columns of Q / rows of R may
--     be negated compared with other implementations; Q * R = A holds either way.
--   * The identity generator supports m up to 1,000,000 rows.

CREATE FUNCTION dbo.QRDecomposition (
    @matrix dbo.Matrix READONLY
)
RETURNS @result TABLE (matrix char(1), i int, j int, Aij float)
AS
BEGIN

    DECLARE @m int, @n int, @i int, @j int, @a float, @size int

    SELECT @m = MAX(i), @n = MAX(j)
    FROM @matrix

    SET @i = 1
    SET @j = 1

    -- TOP rejects NULL and negative values, so an empty input maps to a 0 x 0 identity
    SET @size = CASE WHEN @m > 0 THEN @m ELSE 0 END

    DECLARE @R dbo.Matrix
    DECLARE @Qj dbo.Matrix
    DECLARE @Q dbo.Matrix

    -- Generate a @m by @m identity matrix to transform to Q.
    -- e6 yields 10^6 rows; TOP stops the row generator after @size numbers.
    ;WITH e1(n) AS
    (
        SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
        SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
        SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
    ),
    e3(n) AS (SELECT 1 FROM e1 AS a CROSS JOIN e1 AS b CROSS JOIN e1 AS c),
    e6(n) AS (SELECT 1 FROM e3 AS a CROSS JOIN e3 AS b),
    numbers(n) AS (
        SELECT TOP (@size) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) AS rn
        FROM e6
        ORDER BY rn
    )
    INSERT INTO @Q (i, j, Aij)
    SELECT i.n, j.n, CASE WHEN i.n = j.n THEN 1 ELSE 0 END
    FROM numbers i
    CROSS JOIN numbers j

    -- Copy input matrix to be transformed to R
    INSERT @R (i, j, Aij)
    SELECT i, j, Aij
    FROM @matrix

    -- Loop performing Householder reflections: one per column, except that the last
    -- column is skipped when the matrix has no rows below its diagonal (m <= n)
    WHILE @j < @n OR (@j = @n AND @m > @n)  BEGIN

        -- Euclidean norm of column @j restricted to the rows on and below the diagonal
        SELECT @a = SQRT(SUM(Aij * Aij))
        FROM @R
        WHERE j = @j
            AND i >= @i

        -- Give the norm the sign of the pivot R(@j, @j) so that adding it to the pivot
        -- below never cancels. A zero pivot is treated as positive: SIGN(0) = 0 would
        -- zero @a and turn the reflection into a plain negation of the column.
        SELECT @a = CASE WHEN Aij < 0 THEN -@a ELSE @a END
        FROM @R
        WHERE j = @j
            AND i = @i

        -- u = x + sign(x1) * ||x|| * e1, then Qj = I - 2 * u * u' / (u' * u) for rows/columns >= @i
        ;WITH u (i, j, Aij) AS (
            SELECT i, 1, CASE WHEN i = j THEN Aij + @a ELSE Aij END
            FROM @R
            WHERE j = @j
                AND i >= @i
        )
        INSERT @Qj (i, j, Aij)
        SELECT i, j, CASE WHEN i = j THEN 1 - 2 * Aij ELSE - 2 * Aij END AS Aij
        FROM (
            -- u is the zero vector only when the whole sub-column is zero; nothing has to
            -- be eliminated then, so fall back to 0 here, which makes Qj the identity
            SELECT u.i, ut.i AS j, COALESCE(u.Aij * ut.Aij / NULLIF(uu.normSq, 0), 0) AS Aij
            FROM u u
            CROSS JOIN u ut
            CROSS JOIN (SELECT SUM(Aij * Aij) AS normSq FROM u) uu
        ) vvt

        -- Apply inverse Householder reflection to Q: Q <- Q * Qj'
        UPDATE Qj
            SET Aij = [Qj+1].Aij
        FROM @Q Qj
        INNER JOIN (
            SELECT Q.i, QjT.j, SUM(QjT.Aij * Q.Aij) AS Aij
            FROM @Q Q
            INNER JOIN (
                SELECT i AS j, j AS i, Aij
                FROM @Qj
            ) QjT ON QjT.i = Q.j
            GROUP BY Q.i, QjT.j
        ) [Qj+1] ON [Qj+1].i = Qj.i AND [Qj+1].j = Qj.j

        -- Apply Householder reflection to R: R <- Qj * R
        UPDATE Rj
            SET Aij = [Rj+1].Aij
        FROM @R Rj
        INNER JOIN (
            SELECT Qj.i, R.j, SUM(Qj.Aij * R.Aij) AS Aij
            FROM @Qj Qj
            INNER JOIN @R R ON R.i = Qj.j
            GROUP BY Qj.i, R.j
        ) [Rj+1] ON [Rj+1].i = Rj.i AND [Rj+1].j = Rj.j

        -- Prepare Qj for next Householder reflection: rows/columns up to @j become
        -- identity and the trailing block is removed so the next pass can insert it
        UPDATE @Qj
            SET Aij = CASE WHEN i = j THEN 1 ELSE 0 END
        WHERE i <= @j OR j <= @j

        DELETE FROM @Qj WHERE i > @j AND j > @j

        SET @j = @j + 1
        SET @i = @i + 1

    END

    -- Output Q
    INSERT @result (matrix, i, j, Aij)
    SELECT 'Q', i, j, Aij
    FROM @Q

    -- Output R
    INSERT @result (matrix, i, j, Aij)
    SELECT 'R', i, j, Aij
    FROM @R

    RETURN

END
GO

-- Least squares regression: finds b minimising ||Xb - y|| by factoring X = QR
-- and solving R b = Q'y with back substitution.
--
-- Parameters:
--   @X  matrix of regressors, one row per cell (i, j, Aij)
--   @y  matrix holding the observed values, joined to Q on i
-- Returns:
--   one row per coefficient: i = column of @X the coefficient belongs to,
--   j = 1 (always), Aij = estimated coefficient

CREATE FUNCTION dbo.MatrixLeastSquareRegression (
    @X dbo.Matrix READONLY
    , @y dbo.Matrix READONLY
)
RETURNS @b TABLE (i int, j int, Aij float)
AS
BEGIN

    -- Factor X once and keep both factors, tagged 'Q' / 'R'
    DECLARE @factors TABLE (matrix char(1), i int, j int, Aij float)

    INSERT INTO @factors (matrix, i, j, Aij)
    SELECT qr.matrix, qr.i, qr.j, qr.Aij
    FROM dbo.QRDecomposition(@X) AS qr

    -- Right-hand side of the triangular system: rhs = Q'y
    DECLARE @rhs dbo.Matrix

    INSERT INTO @rhs (i, j, Aij)
    SELECT q.j, obs.j, SUM(q.Aij * obs.Aij)
    FROM @factors AS q
    INNER JOIN @y AS obs ON obs.i = q.i
    WHERE q.matrix = 'Q'
    GROUP BY q.j, obs.j

    -- Start at the last column of R and work upwards
    DECLARE @row int

    SELECT @row = MAX(r.j)
    FROM @factors AS r
    WHERE r.matrix = 'R'

    -- Solve Rb = Q'y via back substitution
    WHILE @row > 0 BEGIN

        INSERT INTO @b (i, j, Aij)
        SELECT diag.i, 1, ( rhs.Aij - ISNULL(known.Aij, 0) ) / diag.Aij
        FROM @factors AS diag
        INNER JOIN @rhs AS rhs ON rhs.i = diag.i
        -- The aggregate always yields exactly one row (NULL while @b is still empty),
        -- which holds the contribution of the coefficients solved so far
        CROSS JOIN (
            SELECT SUM(r.Aij * ISNULL(solved.Aij, 0)) AS Aij
            FROM @factors AS r
            INNER JOIN @b AS solved ON solved.i = r.j
            WHERE r.matrix = 'R'
                AND r.i = @row
        ) AS known
        WHERE diag.matrix = 'R'
            AND diag.i = @row
            AND diag.j = @row

        SET @row = @row - 1

    END

    RETURN

END
GO

以下是测试脚本/使用示例:

-- Test script / usage example for dbo.MatrixLeastSquareRegression:
-- builds random data from known coefficients, fits the model and compares the
-- estimated coefficients and fitted values with the known ones.

DECLARE @TestData TABLE (i int IDENTITY(1, 1), X1 float, X2 float, X3 float, X4 float, y float)

DECLARE @c float
DECLARE @b1 float
DECLARE @b2 float
DECLARE @b3 float
DECLARE @b4 float

-- c and the bs are the target coefficients

SET @c = RAND()
SET @b1 = 2 * RAND()
SET @b2 = 3 * RAND()
SET @b3 = 4 * RAND()
SET @b4 = 5 * RAND()

-- Generate some test data, calculate y from c + Xb plus some noise: y = c + Xb + e
-- Note: Using RAND() for e is not normally distributed noise as linear regression assumes;
-- 0.2 * RAND() has mean 0.1 rather than 0, which shifts the estimate of c

DECLARE @k int = 1

-- @k runs from 1 to 49, so 49 observations are generated
WHILE @k < 50 BEGIN

    INSERT @TestData(X1, X2, X3, X4, y)
    SELECT x1, x2, x3, x4, @c + x1 * @b1 + x2 * @b2 + x3 * @b3 + x4 * @b4 + 0.2 * RAND()
    FROM (
        SELECT RAND() AS x1, RAND() AS x2, RAND() AS x3, RAND() AS x4
    ) X

    SET @k = @k + 1

END

-- Put our data into dbo.Matrix types

DECLARE @X dbo.Matrix

-- Every branch writes a different column index j, so the branches cannot produce
-- duplicate rows and UNION ALL avoids the de-duplication work of UNION
INSERT @X (i, j, Aij)
-- Extra column for constant
SELECT i, 1, 1
FROM @TestData
UNION ALL
SELECT i, 2, X1
FROM @TestData
UNION ALL
SELECT i, 3, X2
FROM @TestData
UNION ALL
SELECT i, 4, X3
FROM @TestData
UNION ALL
SELECT i, 5, X4
FROM @TestData

DECLARE @y dbo.Matrix

INSERT @y (i, j, Aij)
SELECT i, 1, y
FROM @TestData

-- Estimates for coefficient values
DECLARE @bhat dbo.Matrix

INSERT @bhat (i, j, Aij)
SELECT i, j, Aij
FROM dbo.MatrixLeastSquareRegression(@X, @y)

-- Target coefficient (b) next to its estimate (best), in coefficient order
SELECT CASE i
        WHEN 1 THEN @c
        WHEN 2 THEN @b1
        WHEN 3 THEN @b2
        WHEN 4 THEN @b3
        WHEN 5 THEN @b4
    END AS b
    , Aij AS best
FROM @bhat
ORDER BY i

-- Fitted values yest = X * bhat, computed once and reused by both reports below
DECLARE @fit TABLE (i int PRIMARY KEY, y float, yest float)

INSERT @fit (i, y, yest)
SELECT y.i, y.Aij, Xb.Aij
FROM (
    SELECT x.i, SUM(x.Aij * bh.Aij) AS Aij
    FROM @X x
    INNER JOIN @bhat bh ON bh.i = x.j
    GROUP BY x.i
) Xb
INNER JOIN @y y ON y.i = Xb.i

-- Observed value next to fitted value, one row per observation
SELECT y, yest
FROM @fit
ORDER BY i

-- Mean of the squared residuals (sum of squares divided by the number of observations)
SELECT SUM(SQUARE(y - yest)) / COUNT(*) AS [Variance]
FROM @fit

答案 1 :(得分:1)

虽然我赞扬编写可以执行各种高级统计计算的纯SQL函数的工作,但SQL并不是解决这些问题的最佳语言。

CLR绝对是一个选项(正如David Manning所建议的那样),与纯SQL相比,它很可能在这个特定问题上表现得更好。

另一种方法是使用统计语言。我建议使用R。它有用于从SQL Server读取数据和向SQL Server写入数据的包,以及用于执行各种回归的多种函数。最重要的是:它是免费的!这里有一篇很好的入门文章(Here is an excellent introductory article),介绍如何开始使用R并对SQL Server 2012中的数据进行统计分析。

答案 2 :(得分:0)

为什么不使用Analysis Services中的LinearRegression数据挖掘(Data Mining)算法(尽管它本质上是一个用来拟合线性回归的决策树)?你只需要为它设计合适的挖掘模型(Mining Model)。

提示:不需要OLAP多维数据集,您可以直接基于关系表/视图来设计它。

Analysis Services功能包含在SQL Server标准版及更高版本中。处理完模型后,您可以使用类似SQL的语言对其进行查询(query),并获取回归函数、方差和其他有用的信息。