Question

我必须开发一个数据侦听功能，在该功能中，我将熊猫数据框中的数据集分成较小的部分，然后在其中插入三次样条线（在我的情况下为Z值）。为此，我使用了interpolate.interp2d中的scipy函数，然后从估计的值中减去了给定的高度值以得到残差，在这里我可以应用3 sigma阈值，并删除离群值最高的行

为了更好地理解，这是我的数据集的3D图，其中包含异常值：

但是当我运行代码时，会收到以下警告：

/home/mattes/anaconda3/lib/python3.6/site-packages/scipy/interpolate
/_fitpack_impl.py:976: RuntimeWarning: The maximal number of iterations 
(20) allowed for finding smoothing
spline with fp=s has been reached. Probable cause: s too small.
(abs(fp-s)/s>0.001)
    kx,ky=3,3 nx,ny=18,21 m=915 fp=221.198171 s=0.000000
 warnings.warn(RuntimeWarning(_iermess2[ierm][0] + _mess))

/home/mattes/anaconda3/lib/python3.6/site-packages/scipy/interpolate
/_fitpack_impl.py:976: RuntimeWarning: A theoretically impossible result 
when finding a smoothing spline
with fp = s. Probable causes: s too small or badly chosen eps.
(abs(fp-s)/s>0.001)
    kx,ky=3,3 nx,ny=19,22 m=914 fp=209.480429 s=0.000000
warnings.warn(RuntimeWarning(_iermess2[ierm][0] + _mess))

/home/mattes/anaconda3/lib/python3.6/site-packages/scipy/interpolate
/_fitpack_impl.py:976: RuntimeWarning: No more knots can be added because 
the number of B-spline
coefficients already exceeds the number of data points m.
Probable causes: either s or m too small. (fp>s)
    kx,ky=3,3 nx,ny=26,46 m=911 fp=158.754387 s=0.000000
warnings.warn(RuntimeWarning(_iermess2[ierm][0] + _mess))

这是我创建的代码：

def DataSnooping_splines(df, sigma_size, step_size):


s = sigma_size
Orbit_No = df.index
df = df.assign(orbitNo = Orbit_No)
df = df.sort_values(by=['chainage_km'])
df.index = pd.RangeIndex(len(df.index))

start_pos = df['chainage_km'].values[0]
end_pos = int(df['chainage_km'].values[len(df['chainage_km'].values)-1])
start_points = np.arange(start_pos, end_pos, step_size)

for i in start_points:

    dfn = df.sort_values(by=['chainage_km'])
    dfn = dfn[(dfn['chainage_km']>= i) & (dfn['chainage_km']<=i+step_size)]
   # dfn.index = pd.RangeIndex(len(dfn.index))

    # extract the needed values from the dataframe:
    time = dfn['daytime'].values
    chainage = dfn['chainage_km'].values
    height = dfn['Surf1MinusEGM08'].values

    # calculate the parameters for the cubic spline:
    f = interpolate.interp2d(time, chainage, height, kind='cubic')

    # estimate the spline:
    height_spline = []

    for dt in range(len(time)):
        height_spline.append(f(time[dt], chainage[dt]))
        pass
    height_splines = np.asarray(height_spline)

    residuals = []
    # calculate the residuals:
    for d in range(len(height)):
        residuals.append(height[d] - height_splines[d])
        pass

    print('length residuals ', len(residuals))
    print('length df ', len(dfn))

    # save the residuals in the dataframe:
    dfn = dfn.assign(residual=residuals)

    # calculate the standard deviation
    std_res = np.std(dfn['residual'])

    # count the number of outlayer
    number_bigger_3sigma = len(dfn[np.abs(dfn['residual'])>s*std_res])

    # Find elements bigger than 3sigma and eliminate them iteratively
    while number_bigger_3sigma > 0:
        # eliminate biggest element
        # save all values higher 3 sigma in dfn_highersigme
        dfn_highersigma = dfn[np.abs(dfn['residual'])>s*std_res]

        # Extract the highest value
        max_val = np.max(np.abs(dfn_highersigma['residual']))
        dfn_drop = dfn_highersigma[np.abs(dfn_highersigma['residual']) == max_val[0]]
        try:
            dfn_drop_s
            check_i = 0
        except:
            dfn_drop_s = dfn_drop
            check_i = 1

        if check_i == 0:
            dfn_drop_s = dfn_drop_s.append(dfn_drop)
            pass

        # Eliminate the highest value
        dfn = dfn.drop(dfn_drop.index, axis=0)

        # extract the needed values from the dataframe:
        time = dfn['daytime'].values
        chainage = dfn['chainage_km'].values
        height = dfn['Surf1MinusEGM08'].values

        # Use the hight of the remaining values to calculate the polygon again
        f = interpolate.interp2d(time, chainage, height, kind='cubic')

        # estimate the spline:
        height_spline = []

        for dt in range(len(time)):
            height_spline.append(f(time[dt], chainage[dt]))
            pass

        height_splines = np.asarray(height_spline)

        residuals = []
        # calculate the residuals:
        for d in range(len(height)):
            residuals.append(height[d] - height_splines[d])
            pass

        # save the residuals in the dataframe:
        dfn = dfn.assign(residual=residuals)

        # calculate the standard deviation
        std_res = np.std(dfn['residual'])

        # calculate new number of elements bigger than 3sigma
        number_bigger_3sigma = len(dfn[np.abs(dfn['residual'])>s*std_res])

    height_spline = np.asarray(height_spline)
    z_new = []
    for dd in range(len(dfn['residual'])):
        z_new.append(dfn['residual'].values[dd] + height_spline[dd])
        pass
    dfn = dfn.assign(Surf1MinusEGM08_new = z_new)

    try:
        dfn_new
        count_i = 0
    except NameError:
         dfn_new = dfn     
         count_i = 1

    if count_i == 0:
        dfn_new = dfn_new.append(dfn) 
        pass


dfn_new = dfn_new.set_index('orbitNo')
return dfn_new, dfn_drop_s

我用sigme_size 3和step_size = 50表示有足够的数据点用于样条计算。此外，在消除了数据点的情况下，使用该功能后异常值仍然存在。

有人知道如何解决此问题吗？

非常感谢您！

使用三次样条进行插补的运行时警告

0 个答案: