Question

我有一组取值在 -360 到 360 度之间、会发生环绕（wrap-around）的数据点。我想在不拆分数据集的情况下对它们拟合一条直线。有没有办法修改 scikit-learn 的 LinearRegression 模型来做到这一点？如果不行，编写一个能够处理数据环绕的直线拟合算法的最佳方法是什么？

Answer 1

这是一个非常有趣的问题，因为您只有一个特征作为输入，而它不包含任何有关环绕的信息。我能想到的最简单的方法是使用最近邻法：

from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
import numpy as np

####################
# Create some data: a line wrapped into [-360, 360), plus Gaussian noise
n_points = 100
X = np.linspace(0, 1, n_points) - 0.3
y = (X*720*2 % 720) - 360
y = y + np.random.normal(0, 15, n_points)
# scikit-learn estimators take a 2-D (n_samples, n_features) input
X = X.reshape(-1, 1)
#######################

# Fit a nearest-neighbour regressor with default settings; it predicts from
# nearby samples only, so it needs no explicit model of the wrap-around.
knn = KNeighborsRegressor()
knn.fit(X, y)

# Dense grid over the same x-range as the data, used to draw the fitted curve
lspace = np.linspace(0, 1, 1000) - 0.3
lspace = lspace.reshape(-1, 1)
plt.scatter(X, y)
# Plot the predictions of the fitted `knn` model (the original referenced an
# undefined name `svr` here).
plt.plot(lspace, knn.predict(lspace), color='C1')

但是，如果您需要分段线性的结果，那么我建议您看看这篇博客文章（this blog post）。

Answer 2

在噪声水平较高（即问题真正有难度）的情况下，可能无法避免暴力搜索。

以下是三个模型的平方误差（使用环绕距离）随斜率变化的曲线（对每个斜率都选取最佳截距）。三个模型的噪声水平分别为 90、180、180，数据点数分别为 64、96、128（参见下面的脚本）。

我不确定是否存在一种能够可靠地找到这些全局最小值的方法。

另一方面，即使在看起来相当困难的情况下（例如最下面的那幅图），暴力搜索的效果也相当不错。虚线是没有噪声的真实模型，圆点是向真实模型添加噪声后生成的实际数据，实线是重建结果。

代码：

import numpy as np
import scipy.optimize as so
from operator import attrgetter
from matplotlib import pylab

def setup(interc, slope, sigma, N):
    x = np.random.uniform(0.1, 2.0, (N,)).cumsum()
    y = (interc + x*slope + np.random.normal(0, sigma, (N,)) + 360) % 720 - 360
    return x, y

def err_model_full(params, x, y):
    """Return the wrapped sum of squared residuals for a line.

    ``params`` unpacks to ``(intercept, slope)``. Each residual
    ``intercept + slope*x - y`` is folded into [-360, 360) with period 720
    before squaring, so predictions that differ from the data by a whole
    period count as exact.
    """
    intercept, gradient = params
    residual = intercept + x*gradient - y
    wrapped = np.mod(residual + 360, 720) - 360
    return np.dot(wrapped, wrapped)

def err_model(interc, slope, x, y):
    """Return the wrapped sum of squared residuals for a line.

    Same error measure as the two-parameter version, but with the intercept
    as the leading argument and the slope passed separately, so an optimizer
    can vary the intercept alone while the slope is held fixed.
    """
    residual = interc + x*slope - y
    # Fold each residual into [-360, 360) with period 720.
    wrapped = np.mod(residual + 360, 720) - 360
    return np.dot(wrapped, wrapped)

# Run three synthetic experiments; each tuple is
# (true intercept, true slope, noise sigma, number of points).
for i, (interc, slope, sigma, N) in enumerate([(100, -12, 90, 64),
                                               (-30, 20, 180, 96),
                                               (66, -49, 180, 128)]):

    # create problem
    x, y = setup(interc, slope, sigma, N)

    # brute force through slopes: for every candidate slope, optimize the
    # intercept alone and record the resulting wrapped squared error
    slps = np.linspace(-128, 128, 257)
    fits = [so.minimize(err_model, (0,), args=(sl, x, y)) for sl in slps]
    # Each result's `x` is a length-1 array. Take the scalar so the start
    # point for the polish step below is a flat (intercept, slope) pair,
    # not a ragged sequence mixing an array with a float.
    ics = [fit.x[0] for fit in fits]
    err = [fit.fun for fit in fits]
    best = np.argmin(err)
    # polish: refine intercept and slope jointly, starting from the best
    # brute-force candidate
    res = so.minimize(err_model_full, (ics[best], slps[best]), args=(x, y))

    # plot

    # figure 1: squared error as a function of slope, one panel per experiment
    pylab.figure(1)
    pylab.subplot(3, 1, i+1)
    pylab.plot(slps, err)
    # figure 2: data (dots), reconstructed line (solid), true line (dashed)
    pylab.figure(2)
    pylab.subplot(3, 1, i+1)
    pylab.plot(x, y, 'o')
    ic_rec, sl_rec = res.x
    pylab.plot(x, (ic_rec + x*sl_rec + 360) % 720 - 360)
    pylab.plot(x, (interc + x*slope + 360) % 720 - 360, '--')

    print('true (intercept, slope)', (interc, slope), 'reconstructed',
          tuple(res.x))
    print('noise level', sigma)
    print('squared error for true params', err_model_full((interc, slope), x, y))
    print('squared error for reconstructed params', err_model_full(res.x, x, y))
pylab.figure(1)
pylab.savefig('bf.png')
pylab.figure(2)
pylab.savefig('recon.png')

拟合环绕（wrapped）的直线

2 个答案: