我是Python的新手。我有一个数据集,我正在尝试使用numPy / sciPy来预测/推断未来的数据点。有没有一种简单的方法来提出适合我当前数据的数学函数(比如,正弦函数),然后我可以将新值传递给该函数以获得我的预测?
这就是我所拥有的,但我不认为它正在做我想要的事情:
import numpy as np
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
def main():
y = [8.3, 8.3, 8.3, 8.3, 7.2, 7.8, 7.8, 8.3, 9.4, 10.6, 10.0, 10.6, 11.1, 12.8,
12.8, 12.8, 11.7, 10.6, 10.6, 10.0, 10.0, 8.9, 8.9, 8.3, 7.2, 6.7, 6.7, 6.7,
7.2, 8.3, 7.2, 10.6, 11.1, 11.7, 12.8, 13.3, 15.0, 15.6, 13.3, 15.0, 13.3,
11.7, 11.1, 10.0, 10.6, 9.4, 8.9, 8.3, 8.9, 6.7, 6.7, 6.0, 6.1, 8.3, 8.3,
10.6, 11.1, 11.1, 11.7, 12.2, 13.3, 14.4, 16.7, 14.4, 13.3, 12.2, 11.7,
11.1, 10.0, 8.3, 7.8, 7.2, 8.0, 6.7, 7.2, 7.2, 7.8, 10.0, 12.2, 12.8,
12.8, 13.9, 15.0, 16.7, 16.7, 16.7, 15.6, 13.9, 12.8, 12.2, 10.6, 9.0,
8.9, 8.9, 8.9, 8.9, 8.9, 8.9, 8.9, 8.9, 10.0, 10.6, 11.1, 12.0, 11.7,
11.1, 13.0, 13.3, 13.0, 11.1, 10.6, 10.6, 10.0, 10.0, 10.0, 9.4, 9.4,
8.9, 8.3, 9.0, 8.9, 9.4, 9.0, 9.4, 10.6, 11.7, 11.1, 11.7, 12.8, 12.8,
12.8, 13.0, 11.7, 10.6, 10.0, 10.0, 8.9, 9.4, 7.8, 7.8, 8.3, 7.8, 8.9,
8.9, 8.9, 9.4, 10.0, 10.0, 10.6, 11.0, 11.1, 11.1, 12.2, 10.6, 10.0, 8.9,
8.9, 9.0, 8.9, 8.3, 8.9, 8.9, 9.4, 9.4, 9.4, 8.9, 8.9, 8.9, 9.4, 10.0,
11.1, 11.7, 11.7, 11.7, 11.7, 12.0, 11.7, 11.7, 12.0, 11.7, 11.0, 10.6,
9.4, 10.0, 8.3, 8.0, 7.2, 5.6, 6.1, 5.6, 6.1, 6.7, 8.0, 10.0, 10.6, 11.1,
13.3, 12.8, 12.8, 12.2, 11.1, 10.0, 10.0, 10.0, 10.0, 9.4, 8.3]
x = np.array(np.arange(len(y)))
fitting_parameters, covariance = curve_fit(fit, x, y)
a = fitting_parameters[0]
b = fitting_parameters[1]
c = fitting_parameters[2]
d = fitting_parameters[3]
for x_predict in range(len(y) + 1, len(y) + 24):
next_x = x_predict
next_y = fit(next_x, a, b, c, d)
print("next_x: " + str(next_x))
print("next_y: " + str(next_y))
y.append(next_y)
plt.plot(y)
plt.show()
def fit(x, a, b, c, d):
return a*np.sin(b*x + c) + d
我尝试使用curve_fit和univariatespline我的数据,但这只适合我当前的数据并分别平滑我的点数。我的观点是,这些工具只是“适合”我的数据,但实际上并没有给我一个我可以用来获得未来积分的功能。
我认为我可以使用离散傅里叶变换,因为我的数据是周期性的,看起来可以被描述为正弦和余弦的总和。但是,一旦我从时域获得了一个频域,我就会陷入如何“推断”以预测未来时间段和时域中的点:
import numpy as np
import matplotlib.pyplot as plt
mydata = [8.3, 8.3, 8.3, 8.3, 7.2, 7.8, 7.8, 8.3, 9.4, 10.6, 10.0, 10.6, 11.1, 12.8,
12.8, 12.8, 11.7, 10.6, 10.6, 10.0, 10.0, 8.9, 8.9, 8.3, 7.2, 6.7, 6.7, 6.7,
7.2, 8.3, 7.2, 10.6, 11.1, 11.7, 12.8, 13.3, 15.0, 15.6, 13.3, 15.0, 13.3,
11.7, 11.1, 10.0, 10.6, 9.4, 8.9, 8.3, 8.9, 6.7, 6.7, 6.0, 6.1, 8.3, 8.3,
10.6, 11.1, 11.1, 11.7, 12.2, 13.3, 14.4, 16.7, 14.4, 13.3, 12.2, 11.7,
11.1, 10.0, 8.3, 7.8, 7.2, 8.0, 6.7, 7.2, 7.2, 7.8, 10.0, 12.2, 12.8,
12.8, 13.9, 15.0, 16.7, 16.7, 16.7, 15.6, 13.9, 12.8, 12.2, 10.6, 9.0,
8.9, 8.9, 8.9, 8.9, 8.9, 8.9, 8.9, 8.9, 10.0, 10.6, 11.1, 12.0, 11.7,
11.1, 13.0, 13.3, 13.0, 11.1, 10.6, 10.6, 10.0, 10.0, 10.0, 9.4, 9.4,
8.9, 8.3, 9.0, 8.9, 9.4, 9.0, 9.4, 10.6, 11.7, 11.1, 11.7, 12.8, 12.8,
12.8, 13.0, 11.7, 10.6, 10.0, 10.0, 8.9, 9.4, 7.8, 7.8, 8.3, 7.8, 8.9,
8.9, 8.9, 9.4, 10.0, 10.0, 10.6, 11.0, 11.1, 11.1, 12.2, 10.6, 10.0, 8.9,
8.9, 9.0, 8.9, 8.3, 8.9, 8.9, 9.4, 9.4, 9.4, 8.9, 8.9, 8.9, 9.4, 10.0,
11.1, 11.7, 11.7, 11.7, 11.7, 12.0, 11.7, 11.7, 12.0, 11.7, 11.0, 10.6,
9.4, 10.0, 8.3, 8.0, 7.2, 5.6, 6.1, 5.6, 6.1, 6.7, 8.0, 10.0, 10.6, 11.1,
13.3, 12.8, 12.8, 12.2, 11.1, 10.0, 10.0, 10.0, 10.0, 9.4, 8.3]
sp = np.fft.rfft(mydata)
freq = np.fft.rfftfreq(len(mydata), d= 1.0)
plt.subplot(211)
plt.plot(mydata)
plt.subplot(212)
plt.plot(freq, sp, 'r')
plt.show()
我理解外推可能是危险且不可靠的,但就本项目而言,我只是想获得一个可以绘制的工作预测函数。
非常感谢您的帮助。
答案 0 :(得分:4)
这是一种通过将周期性数据表示为傅里叶级数进行插值的方法。傅里叶级数中使用的系数是通过离散FFT得到的。
我不推荐这个 - 你可以在下面看到插值并不是人们直觉上认为非常好的 - 但是因为我在评论中已经提到它,所以我会跟进并显示一些代码: )
import numpy as np
import matplotlib.pyplot as plt
import scipy.fftpack as fftpack
def fft_inverse(Yhat, x):
"""Based on http://stackoverflow.com/a/4452499/190597 (mtrw)"""
Yhat = np.asarray(Yhat)
x = np.asarray(x).reshape(-1, 1)
N = len(Yhat)
k = np.arange(N)
total = Yhat * np.exp(1j * x * k * 2 * np.pi / N)
return np.real(total.sum(axis=1))/N
mydata = [8.3, 8.3, 8.3, 8.3, 7.2, 7.8, 7.8, 8.3, 9.4, 10.6, 10.0, 10.6, 11.1, 12.8,
12.8, 12.8, 11.7, 10.6, 10.6, 10.0, 10.0, 8.9, 8.9, 8.3, 7.2, 6.7, 6.7, 6.7,
7.2, 8.3, 7.2, 10.6, 11.1, 11.7, 12.8, 13.3, 15.0, 15.6, 13.3, 15.0, 13.3,
11.7, 11.1, 10.0, 10.6, 9.4, 8.9, 8.3, 8.9, 6.7, 6.7, 6.0, 6.1, 8.3, 8.3,
10.6, 11.1, 11.1, 11.7, 12.2, 13.3, 14.4, 16.7, 14.4, 13.3, 12.2, 11.7,
11.1, 10.0, 8.3, 7.8, 7.2, 8.0, 6.7, 7.2, 7.2, 7.8, 10.0, 12.2, 12.8,
12.8, 13.9, 15.0, 16.7, 16.7, 16.7, 15.6, 13.9, 12.8, 12.2, 10.6, 9.0,
8.9, 8.9, 8.9, 8.9, 8.9, 8.9, 8.9, 8.9, 10.0, 10.6, 11.1, 12.0, 11.7,
11.1, 13.0, 13.3, 13.0, 11.1, 10.6, 10.6, 10.0, 10.0, 10.0, 9.4, 9.4,
8.9, 8.3, 9.0, 8.9, 9.4, 9.0, 9.4, 10.6, 11.7, 11.1, 11.7, 12.8, 12.8,
12.8, 13.0, 11.7, 10.6, 10.0, 10.0, 8.9, 9.4, 7.8, 7.8, 8.3, 7.8, 8.9,
8.9, 8.9, 9.4, 10.0, 10.0, 10.6, 11.0, 11.1, 11.1, 12.2, 10.6, 10.0, 8.9,
8.9, 9.0, 8.9, 8.3, 8.9, 8.9, 9.4, 9.4, 9.4, 8.9, 8.9, 8.9, 9.4, 10.0,
11.1, 11.7, 11.7, 11.7, 11.7, 12.0, 11.7, 11.7, 12.0, 11.7, 11.0, 10.6,
9.4, 10.0, 8.3, 8.0, 7.2, 5.6, 6.1, 5.6, 6.1, 6.7, 8.0, 10.0, 10.6, 11.1,
13.3, 12.8, 12.8, 12.2, 11.1, 10.0, 10.0, 10.0, 10.0, 9.4, 8.3]
Yhat = fftpack.fft(mydata)
fig, ax = plt.subplots(nrows=2, sharex=True)
xs = np.arange(len(mydata))
ax[0].plot(xs, mydata)
new_xs = np.linspace(xs.min(), xs.max(), len(mydata)*1.5)
new_ys = fft_inverse(Yhat, new_xs)
ax[1].plot(new_xs, new_ys)
plt.xlim(xs.min(), xs.max())
plt.show()
以下是如何使用scipy.optimize
查找适合模型函数的参数,然后可以使用该函数在任意x坐标处进行插值。使用单个sin
的拟合仍然非常糟糕,但我会发布代码只是为了展示如何使用scipy.optimize
:
import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize as optimize
mydata = np.array(
[8.3, 8.3, 8.3, 8.3, 7.2, 7.8, 7.8, 8.3, 9.4, 10.6, 10.0, 10.6, 11.1, 12.8,
12.8, 12.8, 11.7, 10.6, 10.6, 10.0, 10.0, 8.9, 8.9, 8.3, 7.2, 6.7, 6.7, 6.7,
7.2, 8.3, 7.2, 10.6, 11.1, 11.7, 12.8, 13.3, 15.0, 15.6, 13.3, 15.0, 13.3,
11.7, 11.1, 10.0, 10.6, 9.4, 8.9, 8.3, 8.9, 6.7, 6.7, 6.0, 6.1, 8.3, 8.3,
10.6, 11.1, 11.1, 11.7, 12.2, 13.3, 14.4, 16.7, 14.4, 13.3, 12.2, 11.7,
11.1, 10.0, 8.3, 7.8, 7.2, 8.0, 6.7, 7.2, 7.2, 7.8, 10.0, 12.2, 12.8,
12.8, 13.9, 15.0, 16.7, 16.7, 16.7, 15.6, 13.9, 12.8, 12.2, 10.6, 9.0,
8.9, 8.9, 8.9, 8.9, 8.9, 8.9, 8.9, 8.9, 10.0, 10.6, 11.1, 12.0, 11.7,
11.1, 13.0, 13.3, 13.0, 11.1, 10.6, 10.6, 10.0, 10.0, 10.0, 9.4, 9.4,
8.9, 8.3, 9.0, 8.9, 9.4, 9.0, 9.4, 10.6, 11.7, 11.1, 11.7, 12.8, 12.8,
12.8, 13.0, 11.7, 10.6, 10.0, 10.0, 8.9, 9.4, 7.8, 7.8, 8.3, 7.8, 8.9,
8.9, 8.9, 9.4, 10.0, 10.0, 10.6, 11.0, 11.1, 11.1, 12.2, 10.6, 10.0, 8.9,
8.9, 9.0, 8.9, 8.3, 8.9, 8.9, 9.4, 9.4, 9.4, 8.9, 8.9, 8.9, 9.4, 10.0,
11.1, 11.7, 11.7, 11.7, 11.7, 12.0, 11.7, 11.7, 12.0, 11.7, 11.0, 10.6,
9.4, 10.0, 8.3, 8.0, 7.2, 5.6, 6.1, 5.6, 6.1, 6.7, 8.0, 10.0, 10.6, 11.1,
13.3, 12.8, 12.8, 12.2, 11.1, 10.0, 10.0, 10.0, 10.0, 9.4, 8.3])
def fit(x, a, b, c, d):
return a*np.sin(b*x + c) + d
xs = np.linspace(0, 2*np.pi, len(mydata))
guess = (mydata.ptp()/2, 10, 0, mydata.mean())
fitting_parameters, covariance = optimize.curve_fit(fit, xs, mydata, p0=guess)
a, b, c, d = fitting_parameters
print(a, b, c, d)
fig, ax = plt.subplots(nrows=2, sharex=True)
ax[0].plot(xs, mydata)
new_xs = np.linspace(xs.min(), xs.max(), len(mydata)*1.5)
new_ys = fit(new_xs, a, b, c, d)
ax[1].plot(new_xs, new_ys)
plt.xlim(xs.min(), xs.max())
plt.show()
您可以通过选择更好的模型函数(代替fit
)来改善拟合。选择什么是由先验的问题领域知识引导的创造力和直觉问题。更好的方法不仅取决于拟合的好坏,还取决于您希望模型的简单性或复杂性,以及/或应用于新数据集时的预测能力。