Question

要点：我有一个函数，我想通过curve_fit分段，但其部分没有一个干净的分析形式。我通过包含一个有点janky for循环来完成这个过程，但它运行得非常慢：相对数据组的数据为1-2分钟，N = 10,000。我希望就如何（a）使用numpy广播来加速操作（没有for循环）或（b）做一些完全不同的事情以获得相同类型的结果，但更快

我正在使用的函数z=f(x,y; params)是y中的分段，单调递增，直到它达到f的最大值，在点y_0，并且然后饱和并变得不变。我的问题是断点y_0不是分析的，因此需要进行一些优化。这是相对容易的，除了我在x和y都有真实数据，并且断点是拟合参数c的函数。所有数据x，y和z都会产生工具噪音。

示例问题：以下功能已更改，以便更容易说明我尝试处理的问题。是的，我意识到它在分析上是可以解决的，但我真正的问题是不是。

f(x,y; c) = 
    y*(x-c*y),    for y <= x/(2*c)
    x**2/(4*c),   for y >  x/(2*c)

通过获取y_0 = x/(2*c) WRT f的导数并求解最大值，可以找到断点y。将f_max=x**2/(4*c)放回y_0可找到最大f。问题是断点取决于x - 值和拟合参数c，所以我不能计算内部循环之外的断点。

代码我已将点数减少到~500点，以允许代码在合理的时间内运行。我的真实数据有> 10,000点。

import numpy as np
from scipy.optimize import curve_fit,fminbound
import matplotlib.pyplot as plt

def function((x,y),c=1):

    fxn_xy = lambda x,y: y*(x-c*y)

    y_max = np.zeros(len(x))    #create array in which to put max values of y
    fxn_max = np.zeros(len(x))  #array to put the results 

    '''This loop is the first part I'd like to optimize, since it requires 
    O(1/delta) time for each individual pass through the fitting function'''

    for i,X in enumerate(x):
        '''X represents specific value of x to solve for'''        

        fxn_y = lambda y: fxn_xy(X,y)  
        #reduce function to just one variable (y)
        #by inputting given X value for the loop

        max_y = fminbound(lambda Y: -fxn_y(Y), 0, X, full_output=True)
        y_max[i] = max_y[0]
        fxn_max[i] = -max_y[1]

    return np.where(y<=y_max,
        fxn_xy(x,y),
        fxn_max
        )


'''  Create and plot 'real' data   '''
delta = 0.01
xs = [0.5,1.,1.5,2.]  #num copies of each X value

y = []

#create repeated x for each xs value.  +1 used to match size of y, below
x = np.hstack([X]*(int(X//delta+1)) for X in xs)
#create sweep from 1 to the current value of x, with spacing=delta
y = np.hstack([np.arange(0, X, delta) for X in xs]) 
z = function((x,y),c=0.75)

#introduce random instrumentation noise to x,y,z
x += np.random.normal(0,0.005,size=len(x))
y += np.random.normal(0,0.005,size=len(y))  
z += np.random.normal(0,0.05,size=len(z))


fig = plt.figure(1,(12,8))
axis1 = fig.add_subplot(121)
axis2 = fig.add_subplot(122)

axis1.scatter(y,x,s=1)
#axis1.plot(x)
#axis1.plot(z)
axis1.set_ylabel("x value")
axis1.set_xlabel("y value")

axis2.scatter(y,z, s=1)
axis2.set_xlabel("y value")
axis2.set_ylabel("z(x,y)")



'''Curve Fitting process to find optimal c'''
popt, pcov = curve_fit(function, (x,y),z,bounds=(0,2))
axis2.scatter(y, function((x,y),*popt), s=0.5, c='r')

print "c_est = {:.3g} ".format(popt[0])

结果如下图所示，＆＃34;真实＆＃34; x，y，z值（蓝色）和拟合值（红色）。

注意：我的直觉是弄清楚如何广播x变量s.t.我可以在fminbound中使用它。但这可能是天真的。想法？

谢谢大家！

编辑：澄清一下，x - 值并不总是固定在这样的组中，而是可以在y - 值保持稳定的情况下扫描。不幸的是，为什么我需要一些方法来处理x这么多次。

Answer 1

有几件事可以优化。一个问题是数据结构。如果我正确理解了代码，则会查找所有max的{{1}}。但是，您使结构使得相同的值一遍又一遍地重复。因此，在这里你浪费了大量的计算工作。

我不确定x的评估实际上有多困难，但我认为它并不比优化成本高得多。因此，在我的解决方案中，我只计算完整数组，寻找最大值，然后更改之后的值。

我想我的代码也可以进行优化，但现在它看起来像：

提供

import numpy as np
from scipy.optimize import leastsq
import matplotlib.pyplot as plt


def function( yArray, x=1, c=1 ):
    out = np.fromiter( ( y * ( x - c * y ) for y in yArray ), np.float )
    pos = np.argmax( out )
    outMax = out[ pos ]
    return np.fromiter( ( x if i < pos else outMax for i, x in enumerate( out ) ), np.float )


def residuals( param, xArray, yList, zList ):
    c = param
    zListTheory = [ function( yArray, x=X, c=c ) for yArray, X in zip( yList, xArray ) ]
    diffList = [ zArray - zArrayTheory for zArray, zArrayTheory in zip( zList, zListTheory ) ]
    out = [ item  for diffArray in diffList for item in diffArray ]
    return out


'''  Create and plot 'real' data   '''
delta = 0.01
xArray = np.array( [ 0.5, 1., 1.5, 2. ] )  #keep this as parameter
#create sweep from 1 to the current value of x, with spacing=delta
yList = [ np.arange( 0, X, delta ) for X in xArray ] ## as list of arrays
zList = [ function( yArray, x=X, c=0.75 ) for yArray, X in zip( yList, xArray ) ]


fig = plt.figure( 1, ( 12, 8 ) )
ax = fig.add_subplot( 1, 1, 1 )
for y,z in zip( yList, zList ):
    ax.plot( y, z )

#introduce random instrumentation noise
yRList =[ yArray + np.random.normal( 0, 0.02, size=len( yArray ) ) for yArray in yList ]
zRList =[ zArray + np.random.normal( 0, 0.02, size=len( zArray ) ) for zArray in zList ]

ax.set_prop_cycle( None )
for y,z in zip( yRList, zRList ):
    ax.plot( y, z, marker='o', markersize=2, ls='' )

sol, cov, info, msg, ier = leastsq( residuals, x0=.9, args=( xArray, yRList, zRList ), full_output=True )
print "c_est = {:.3g} ".format( sol[0] )
plt.show()

使用原始图表和嘈杂的数据

在curve_fit中找到曲线的最小值

1 个答案: