在Python中scipy.optimize.leastsq()
通常用于非线性回归。但是,leastsq()
原则上也应该与线性拟合函数一起使用。这似乎是一个简单的线性回归问题leastsq()
显然无法正确解决。数据符合y = mx行。
代码示例位于帖子的底部。当plot_real_data = False
时,随机生成100个线性相关数据点。这里leastsq()
可以有效地找到求和平方误差函数的最小值:
然而,当plot_real_data = True
时,则从真实数据集中获取100个数据点。在这里,leastsq()
不能,由于某些未知原因,找到求和平方误差函数的最小值:
leastsq()
始终报告最佳梯度参数m = 1.082,无论梯度的初始猜测如何。然而,m = 1.082并非全球最小值。适当的值更接近m = 1.25:
print sum(errorfunc([1.0], x, y))
3.9511006207
print sum(errorfunc([1.08], x, y))
3.59052114948
print sum(errorfunc([1.25], x, y))
3.37109033259(接近最低限度)
print sum(errorfunc([1.4], x, y))
3.79503789072
这是令人费解的行为。在这种情况下,求和平方误差函数是一个简单的二次方,并且没有局部最小值的风险。
我知道线性回归存在直接的方法,但leastsq()
对这个问题有任何想法吗?
Python 2.7.11 :: Anaconda 4.0.0(64位) Scipy版本0.17.0
CODE:
from __future__ import division
import matplotlib.pyplot as plt
import numpy
import random
from scipy.optimize import leastsq
def errorfunc(params, x_data, y_data) :
"""
Return error at each x point, to a straight line of gradient m
This 1-parameter error function has a clearly defined minimum
"""
squared_errors = []
for i, lm in enumerate(x_data) :
predicted_um = lm * params[0]
squared_errors.append((y_data[i] - predicted_um)**2)
return squared_errors
plt.figure()
###################################################################
# STEP 1: make a scatter plot of the data
plot_real_data = True
###################################################################
if plot_real_data :
# 100 points of real data
x = [0.85772, 0.17135, 0.03401, 0.17227, 0.17595, 0.1742, 0.22454, 0.32792, 0.19036, 0.17109, 0.16936, 0.17357, 0.6841, 0.24588, 0.22913, 0.28291, 0.19845, 0.3324, 0.66254, 0.1766, 0.47927, 0.47999, 0.50301, 0.16035, 0.65964, 0.0, 0.14308, 0.11648, 0.10936, 0.1983, 0.13352, 0.12471, 0.29475, 0.25212, 0.08334, 0.07697, 0.82263, 0.28078, 0.24192, 0.25383, 0.26707, 0.26457, 0.0, 0.24843, 0.26504, 0.24486, 0.0, 0.23914, 0.76646, 0.66567, 0.62966, 0.61771, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.79157, 0.06889, 0.07669, 0.1372, 0.11681, 0.11103, 0.13577, 0.07543, 0.10636, 0.09176, 0.10941, 0.08327, 1.19903, 0.20987, 0.21103, 0.21354, 0.26011, 0.28862, 0.28441, 0.2424, 0.29196, 0.20248, 0.1887, 0.20045, 1.2041, 0.20687, 0.22448, 0.23296, 0.25434, 0.25832, 0.25722, 0.24378, 0.24035, 0.17912, 0.18058, 0.13556, 0.97535, 0.25504, 0.20418, 0.22241]
y = [1.13085, 0.19213, 0.01827, 0.20984, 0.21898, 0.12174, 0.38204, 0.31002, 0.26701, 0.2759, 0.26018, 0.24712, 1.18352, 0.29847, 0.30622, 0.5195, 0.30406, 0.30653, 1.13126, 0.24761, 0.81852, 0.79863, 0.89171, 0.19251, 1.33257, 0.0, 0.19127, 0.13966, 0.15877, 0.19266, 0.12997, 0.13133, 0.25609, 0.43468, 0.09598, 0.08923, 1.49033, 0.27278, 0.3515, 0.38368, 0.35134, 0.37048, 0.0, 0.3566, 0.36296, 0.35054, 0.0, 0.32712, 1.23759, 1.02589, 1.02413, 0.9863, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.19224, 0.12192, 0.12815, 0.2672, 0.21856, 0.14736, 0.20143, 0.1452, 0.15965, 0.14342, 0.15828, 0.12247, 0.5728, 0.10603, 0.08939, 0.09194, 0.1145, 0.10313, 0.13377, 0.09734, 0.12124, 0.11429, 0.09536, 0.11457, 0.76803, 0.10173, 0.10005, 0.10541, 0.13734, 0.12192, 0.12619, 0.11325, 0.1092, 0.11844, 0.11373, 0.07865, 1.28568, 0.25871, 0.22843, 0.26608]
else :
# 100 points of test data with noise added
x_clean = numpy.linspace(0,1.2,100)
y_clean = [ i * 1.38 for i in x_clean ]
x = [ i + random.uniform(-1 * random.uniform(0, 0.1), random.uniform(0, 0.1)) for i in x_clean ]
y = [ i + random.uniform(-1 * random.uniform(0, 0.5), random.uniform(0, 0.5)) for i in y_clean ]
plt.subplot(2,1,1)
plt.scatter(x,y); plt.xlabel('x'); plt.ylabel('y')
# STEP 2: vary gradient m of a y = mx fitting line
# plot sum squared error with respect to gradient m
# here you can see by eye, the optimal gradient of the fitting line
plt.subplot(2,1,2)
try_m = numpy.linspace(0.1,4,200)
sse = [ sum(errorfunc([m], x, y)) for m in try_m ]
plt.plot(try_m,sse); plt.xlabel('line gradient, m'); plt.ylabel('sum-squared error')
# STEP 3: use leastsq() to find optimal gradient m
params = [2] # start with initial guess of 2 for gradient
params_fitted, cov, infodict, mesg, ier = leastsq(errorfunc, params[:], args=(x, y), full_output=1)
optimal_m = params_fitted[0]
print optimal_m
# optimal gradient m should be the minimum of the error function
plt.subplot(2,1,2)
plt.plot([optimal_m,optimal_m],[0,100], 'r')
# optimal gradient m should give best fit straight line
plt.subplot(2,1,1)
plt.plot([0, 1.2],[0, 1.2 * optimal_m],'r')
plt.show()