我已经在SQL Server中开发了简单线性回归函数(https://ask.sqlservercentral.com/questions/96778/can-this-linear-regression-algorithm-for-sql-serve.html)来计算Alpha,Beta和一些额外的值,如Upper 95%和Lower 95%。 简单线性回归将参数作为X和y。
现在我需要在SQL Server中执行多元线性回归,它接受参数y和X1,X2,X3,...... Xn。
因此输出将如下:
Coefficients Standard Error t Stat P-value Lower 95% Upper 95%
+-------------------------------------------------------------------------------------------+
Intercept -23.94650812 19.85250194 -1.20622117 0.351059563 -109.3649298
X Variable 1 0.201064291 0.119759437 1.678901439 0.235179 -0.314218977
X Variable 2 -0.014046021 0.037366638 -0.375897368 0.743119791 -0.174821687
X Variable 3 0.502074905 0.295848189 1.697069389 0.231776287 -0.770857111
X Variable 4 0.068238344 0.219256527 0.311226057 0.785072958 -0.875146351
有谁能建议一个实现这一目标的好方法吗?
答案 0 :(得分:7)
我会考虑使用CLR integration来利用支持线性回归的现有.NET库,例如Math.NET Numerics。使用CLR存储过程,您将能够从表中读取数据,将其转换为.NET库矩阵类型,运行回归,然后将结果写回表或直接返回行集。
不过,仅仅出于好玩,下面是在SQL中使用Householder reflections、通过Orthogonal Decomposition求解Linear Least Squares的实现。(警告:对于任何较大的数据量,它都会运行得很慢。)
-- Table type representing a 2D matrix in coordinate form: one row per element,
-- where i is the row index, j is the column index and Aij is the element value.
-- The primary key on (i, j) allows at most one value per matrix position.
CREATE TYPE dbo.Matrix AS TABLE
(
    i   int,
    j   int,
    Aij float,
    PRIMARY KEY (i, j)
)
GO
-- Function to perform QR factorisation, i.e. A -> QR, using Householder reflections.
--
-- Parameters:
--   @matrix  The matrix A to factorise, one row per element (i = row, j = column,
--            Aij = value). MAX(i) and MAX(j) are taken as its dimensions @m and @n.
-- Returns:
--   One row per element of both factors, tagged by the [matrix] column:
--     'Q' - the @m x @m accumulated product of the reflections,
--     'R' - the reflected copy of A (same shape as A).
-- Notes:
--   * The number generator below yields at most 1000 values, so Q is only complete
--     for @m <= 1000 (extend the CTE for larger inputs).
--   * Each reflection maps the working column x to -sign(pivot) * ||x|| * e1, so the
--     diagonal entries of R carry the opposite sign to the pivot they replaced.
CREATE FUNCTION dbo.QRDecomposition (
    @matrix dbo.Matrix READONLY
)
RETURNS @result TABLE (matrix char(1), i int, j int, Aij float)
AS
BEGIN
    DECLARE @m int, @n int, @i int, @j int, @a float

    -- Matrix dimensions: @m rows, @n columns
    SELECT @m = MAX(i), @n = MAX(j)
    FROM @matrix

    -- @i (first row of the working column) and @j (working column) advance together
    SET @i = 1
    SET @j = 1

    DECLARE @R dbo.Matrix   -- starts as A, is reduced column by column
    DECLARE @Qj dbo.Matrix  -- the Householder reflection for the current column
    DECLARE @Q dbo.Matrix   -- starts as the identity, accumulates the reflections

    -- Generate a @m by @m Identity Matrix to transform to Q, add more numbers for m > 1000
    ;WITH e1(n) AS
    (
        SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
        SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
        SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
    ),
    e2(n) AS (SELECT 1 FROM e1 CROSS JOIN e1 AS b),
    e3(n) AS (SELECT 1 FROM e1 CROSS JOIN e2),
    numbers(n) AS (SELECT ROW_NUMBER() OVER (ORDER BY n) FROM e3)
    INSERT INTO @Q (i, j, Aij)
    SELECT i.n, j.n, CASE WHEN i.n = j.n THEN 1 ELSE 0 END
    FROM numbers i
    CROSS JOIN numbers j
    WHERE i.n <= @m AND j.n <= @m

    -- Copy input matrix to be transformed to R
    INSERT @R (i, j, Aij)
    SELECT i, j, Aij
    FROM @matrix

    -- Loop performing Householder reflections.
    -- A square matrix needs no reflection for its last column (nothing below the diagonal).
    WHILE @j < @n OR (@j = @n AND @m > @n) BEGIN
        -- @a = ||x||: Euclidean norm of column @j on and below the diagonal
        SELECT @a = SQRT(SUM(Aij * Aij))
        FROM @R
        WHERE j = @j
            AND i >= @i

        -- Give @a the sign of the pivot element R(@j, @j) so that the pivot + @a below
        -- never cancels. A zero pivot counts as positive: SIGN(0) = 0 would zero @a and
        -- the reflection would then merely negate the column instead of eliminating it.
        SELECT @a = CASE WHEN Aij < 0 THEN -@a ELSE @a END
        FROM @R
        WHERE j = @j
            AND i = @i

        -- u = x + @a * e1; the reflection is H = I - 2 * u * u' / (u' * u)
        ;WITH u (i, j, Aij) AS (
            SELECT i, 1, CASE WHEN i = j THEN Aij + @a ELSE Aij END
            FROM @R
            WHERE j = @j
                AND i >= @i
        )
        INSERT @Qj (i, j, Aij)
        SELECT i, j, CASE WHEN i = j THEN 1 - 2 * Aij ELSE - 2 * Aij END AS Aij
        FROM (
            -- An all-zero column gives u = 0; NULLIF/ISNULL turn that case into H = I
            -- instead of a division by zero.
            SELECT u.i, ut.i AS j,
                ISNULL(u.Aij * ut.Aij / NULLIF((SELECT SUM(Aij * Aij) FROM u), 0), 0) AS Aij
            FROM u u
            CROSS JOIN u ut
        ) vvt

        -- Apply inverse Householder reflection to Q: Q := Q * Qj'
        UPDATE Qj
        SET Aij = [Qj+1].Aij
        FROM @Q Qj
        INNER JOIN (
            SELECT Q.i, QjT.j, SUM(QjT.Aij * Q.Aij) AS Aij
            FROM @Q Q
            INNER JOIN (
                SELECT i AS j, j AS i, Aij
                FROM @Qj
            ) QjT ON QjT.i = Q.j
            GROUP BY Q.i, QjT.j
        ) [Qj+1] ON [Qj+1].i = Qj.i AND [Qj+1].j = Qj.j

        -- Apply Householder reflections to R: R := Qj * R
        UPDATE Rj
        SET Aij = [Rj+1].Aij
        FROM @R Rj
        INNER JOIN (
            SELECT Qj.i, R.j, SUM(Qj.Aij * R.Aij) AS Aij
            FROM @Qj Qj
            INNER JOIN @R R ON R.i = Qj.j
            GROUP BY Qj.i, R.j
        ) [Rj+1] ON [Rj+1].i = Rj.i AND [Rj+1].j = Rj.j

        -- Prepare Qj for next Householder reflection: keep an identity border for the
        -- rows/columns already processed and drop the block the next INSERT will refill
        UPDATE @Qj
        SET Aij = CASE WHEN i = j THEN 1 ELSE 0 END
        WHERE i <= @j OR j <= @j

        DELETE FROM @Qj WHERE i > @j AND j > @j

        SET @j = @j + 1
        SET @i = @i + 1
    END

    -- Output Q
    INSERT @result (matrix, i, j, Aij)
    SELECT 'Q', i, j, Aij
    FROM @Q

    -- Output R
    INSERT @result (matrix, i, j, Aij)
    SELECT 'R', i, j, Aij
    FROM @R

    RETURN
END
GO
-- Least-squares regression: estimates b in y ~ Xb from the QR factorisation of X.
--
-- Parameters:
--   @X  design matrix, one row per element (i = observation, j = regressor).
--   @y  response values, one row per element (i = observation).
-- Returns:
--   the coefficient estimates as rows (i = regressor index, j = 1, Aij = estimate).
CREATE FUNCTION dbo.MatrixLeastSquareRegression (
    @X dbo.Matrix READONLY
  , @y dbo.Matrix READONLY
)
RETURNS @b TABLE (i int, j int, Aij float)
AS
BEGIN
    -- Both factors of X, tagged 'Q' / 'R' in the matrix column
    DECLARE @factors TABLE (matrix char(1), i int, j int, Aij float)

    INSERT @factors (matrix, i, j, Aij)
    SELECT qr.matrix, qr.i, qr.j, qr.Aij
    FROM dbo.QRDecomposition(@X) qr

    -- Right-hand side of the triangular system: Q'y
    DECLARE @rhs dbo.Matrix

    INSERT INTO @rhs (i, j, Aij)
    SELECT q.j, obs.j, SUM(q.Aij * obs.Aij)
    FROM @factors q
    INNER JOIN @y obs ON obs.i = q.i
    WHERE q.matrix = 'Q'
    GROUP BY q.j, obs.j

    -- Start at the last column of R and walk upwards
    DECLARE @row int

    SELECT @row = MAX(f.j)
    FROM @factors f
    WHERE f.matrix = 'R'

    -- Solve Rb = Q'y via back substitution
    WHILE @row > 0 BEGIN
        INSERT @b (i, j, Aij)
        SELECT diag.i, 1, ( rhs.Aij - ISNULL(known.total, 0) ) / diag.Aij
        FROM @factors diag
        INNER JOIN @rhs rhs ON rhs.i = diag.i
        CROSS JOIN (
            -- Contribution of the coefficients solved so far to row @row;
            -- the aggregate always yields one row (NULL when nothing is solved yet)
            SELECT SUM(r.Aij * ISNULL(solved.Aij, 0)) AS total
            FROM @factors r
            INNER JOIN @b solved ON solved.i = r.j
            WHERE r.matrix = 'R'
                AND r.i = @row
        ) known
        WHERE diag.matrix = 'R'
            AND diag.i = @row
            AND diag.j = @row

        SET @row -= 1
    END

    RETURN
END
GO
以下是测试脚本/使用示例:
-- Test script / usage example for dbo.MatrixLeastSquareRegression:
-- generates random data from known coefficients, fits the regression and
-- compares the estimates and fitted values with the targets.
DECLARE @TestData TABLE (i int IDENTITY(1, 1), X1 float, X2 float, X3 float, X4 float, y float)

DECLARE @c float
DECLARE @b1 float
DECLARE @b2 float
DECLARE @b3 float
DECLARE @b4 float

-- bs are the target coefficients
SET @c = RAND()
SET @b1 = 2 * RAND()
SET @b2 = 3 * RAND()
SET @b3 = 4 * RAND()
SET @b4 = 5 * RAND()

-- Generate some test data, calculate y from c + Xb plus some noise: y = c + Xb + e
-- Note: Using RAND() for e is not normally distributed noise as linear regression assumes, this will mess with the estimate of c
-- One row is inserted per iteration so that every row gets fresh RAND() values (49 rows in total)
DECLARE @k int = 1

WHILE @k < 50 BEGIN
    INSERT @TestData(X1, X2, X3, X4, y)
    SELECT x1, x2, x3, x4, @c + x1 * @b1 + x2 * @b2 + x3 * @b3 + x4 * @b4 + 0.2 * RAND()
    FROM (
        SELECT RAND() AS x1, RAND() AS x2, RAND() AS x3, RAND() AS x4
    ) X

    SET @k = @k + 1
END

-- Put our data into dbo.Matrix types
DECLARE @X dbo.Matrix

-- Every branch writes a different column j, so the rows are already distinct
-- and UNION ALL avoids a needless de-duplication step
INSERT @X (i, j, Aij)
-- Extra column for constant
SELECT i, 1, 1
FROM @TestData
UNION ALL
SELECT i, 2, X1
FROM @TestData
UNION ALL
SELECT i, 3, X2
FROM @TestData
UNION ALL
SELECT i, 4, X3
FROM @TestData
UNION ALL
SELECT i, 5, X4
FROM @TestData

DECLARE @y dbo.Matrix

INSERT @y (i, j, Aij)
SELECT i, 1, y
FROM @TestData

-- Estimates for coefficient values
DECLARE @bhat dbo.Matrix

INSERT @bhat (i, j, Aij)
SELECT i, j, Aij
FROM dbo.MatrixLeastSquareRegression(@X, @y)

-- Fitted values X * bhat, computed once and reused by both reports below
DECLARE @Xb TABLE (i int PRIMARY KEY, Aij float)

INSERT @Xb (i, Aij)
SELECT x.i, SUM(x.Aij * bh.Aij)
FROM @X x
INNER JOIN @bhat bh ON bh.i = x.j
GROUP BY x.i

-- Target coefficient (b) next to its estimate (best), in coefficient order
SELECT CASE i
        WHEN 1 THEN @c
        WHEN 2 THEN @b1
        WHEN 3 THEN @b2
        WHEN 4 THEN @b3
        WHEN 5 THEN @b4
    END AS b
    , Aij AS best
FROM @bhat
ORDER BY i

-- Observed y next to its fitted value, in observation order
SELECT y.Aij AS y, Xb.Aij AS yest
FROM @Xb Xb
INNER JOIN @y y ON y.i = Xb.i
ORDER BY y.i

-- Mean squared residual of the fit
SELECT SUM(SQUARE(y.Aij - Xb.Aij)) / COUNT(*) AS [Variance]
FROM @Xb Xb
INNER JOIN @y y ON y.i = Xb.i
答案 1 :(得分:1)
虽然我赞扬编写可以执行各种高级统计计算的纯SQL函数的工作,但SQL并不是解决这些问题的最佳语言。
CLR绝对是一个选项(正如David Manning所建议的那样),与纯SQL相比,它很可能在这个特定问题上表现得更好。
另一种方法是使用统计语言。我建议使用R。它具有用于从SQL Server读取数据和向其写入数据的内置包,以及用于执行各种回归的多种函数。最重要的是:它是免费的! Here is an excellent introductory article,介绍如何开始使用R并对SQL Server 2012中的数据执行统计分析。
答案 2 :(得分:0)
为什么不使用Analysis Services中的LinearRegression DataMining算法(虽然它本质上是一个适用于线性回归的决策树)?你只需要为它设计正确的挖掘Model。
提示:不需要OLAP多维数据集,您可以直接基于关系表/视图来设计它。
Analysis Services功能包含在SQL Server标准版及更高版本中。处理完模型后,您可以使用类似SQL的语言对其进行query,并检索回归函数、方差和其他有用的信息。