Question

解释

我有两个 numpy 数组：dataX 和 dataY（在下面的代码中分别为 xdata 和 ydata），我想对每个数组进行滤波以减少噪声。下图显示了实际的输入数据（蓝点）以及我期望得到的结果示例（红点）。滤波后的数据不必像示例中那样完美，但我确实希望它尽可能平直。我在代码中提供了示例数据。

我尝试了什么：

首先，您可以看到数据不是“连续的”，因此我首先将它们分成单独的“段”（在此示例中为4个），然后对每个“段”应用过滤器。有人建议我使用Savitzky-Golay filter。完整的可运行代码如下：

import scipy as sc
import scipy.signal
import numpy as np
import matplotlib.pyplot as plt

# Sample Data
ydata = np.array([1,0,1,2,1,2,1,0,1,1,2,2,0,0,1,0,1,0,1,2,7,6,8,6,8,6,6,8,6,6,8,6,6,7,6,5,5,6,6, 10,11,12,13,12,11,10,10,11,10,12,11,10,10,10,10,12,12,10,10,17,16,15,17,16, 17,16,18,19,18,17,16,16,16,16,16,15,16])
xdata = np.array([1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32,33, 1,2,3,1,5,4,7,8,6,10,11,12,13,10,12,13,17,16,19,18,21,19,23,21,25,20,26,27,28,26,26,26,29,30,30,29,30,32])

# Split the samples into segments wherever y jumps upwards, smooth every
# segment with a Savitzky-Golay filter, and collect the results in
# filteredX / filteredY (same length and order as the input arrays).

# An increase in y larger than this between neighbouring samples starts a
# new segment.
JUMP_THRESHOLD = 3
# Polynomial order used for the Savitzky-Golay fit.
POLY_ORDER = 3

# Used a diff array to find where there is a big change in Y.
# If there's a big change in Y, then there must be a change of 'segment'.
diffy = np.diff(ydata)

# diffy[i] == ydata[i + 1] - ydata[i], so a jump detected at position i means
# the new segment begins at sample i + 1. Cutting at i would push the last
# sample of the previous segment into the next one.
breaks = np.where(diffy > JUMP_THRESHOLD)[0] + 1

smoothedX = []
smoothedY = []
for segX, segY in zip(np.split(xdata, breaks), np.split(ydata, breaks)):
    print(segX)

    # Window length must be an odd integer no larger than the segment.
    window = segX.size if segX.size % 2 == 1 else segX.size - 1

    if window > POLY_ORDER:
        smoothedX.append(sc.signal.savgol_filter(segX, window, POLY_ORDER))
        smoothedY.append(sc.signal.savgol_filter(segY, window, POLY_ORDER))
    else:
        # savgol_filter requires polyorder < window_length; a segment this
        # short cannot be fitted, so its samples are passed through as-is.
        smoothedX.append(segX.astype(float))
        smoothedY.append(segY.astype(float))

filteredX = np.concatenate(smoothedX)
filteredY = np.concatenate(smoothedY)

# Plots: the raw samples as blue dots, the filtered samples as red dots.
series = (
    (xdata, ydata, 'bo', 'Input Data'),
    (filteredX, filteredY, 'ro', 'Filtered Data'),
)
for xs, ys, marker, caption in series:
    plt.plot(xs, ys, marker, label=caption)

# Axis labels, title and legend, then show the figure.
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Result')
plt.legend()
plt.show()

这是我的结果：把各点连接起来后，结果如下所示。我尝试过不同的多项式阶数，但似乎三阶给出的结果最好。

我还尝试过这些过滤器，其中包括：

但到目前为止，我尝试过的滤波器都没有得到接近我真正想要的结果。对这些数据进行滤波的最佳方法是什么？期待您的帮助。

Answer 1

要得到接近理想效果的结果，一种方法是聚类 + 线性回归。

请注意，您必须提供聚类（簇）的数量，并且在聚类之前我还把 y 放大了一些（代码中是乘以 2）。

import numpy as np
from scipy import cluster, stats

# Sample data copied from the question: 77 paired (x, y) integer samples.
ydata = np.array([
    1, 0, 1, 2, 1, 2, 1, 0, 1, 1, 2, 2, 0, 0, 1, 0, 1, 0, 1, 2,
    7, 6, 8, 6, 8, 6, 6, 8, 6, 6, 8, 6, 6, 7, 6, 5, 5, 6, 6,
    10, 11, 12, 13, 12, 11, 10, 10, 11, 10, 12, 11, 10, 10, 10, 10, 12, 12, 10, 10,
    17, 16, 15, 17, 16,
    17, 16, 18, 19, 18, 17, 16, 16, 16, 16, 16, 15, 16,
])
xdata = np.array([
    1, 2, 3, 1, 5, 4, 7, 8, 6, 10, 11, 12, 13, 10, 12, 13, 17, 16, 19, 18,
    21, 19, 23, 21, 25, 20, 26, 27, 28, 26, 26, 26, 29, 30, 30, 29, 30, 32, 33,
    1, 2, 3, 1, 5, 4, 7, 8, 6, 10, 11, 12, 13, 10, 12, 13, 17, 16, 19, 18,
    21, 19, 23, 21, 25, 20, 26, 27, 28, 26, 26, 26, 29, 30, 30, 29, 30, 32,
])

def split_to_lines(x, y, k, y_scale=2):
    """Cluster the points and project each cluster onto its best-fit line.

    The points ``(x, y * y_scale)`` are grouped with k-means
    (``scipy.cluster.vq.kmeans2``). For every cluster an ordinary
    least-squares line is fitted to the *unscaled* ``(x, y)`` members, and
    each member's y value is replaced by the value of that line at its x.

    Parameters
    ----------
    x, y : array_like
        One-dimensional coordinate sequences of equal length.
    k : int
        Number of clusters requested from k-means.
    y_scale : float, optional
        Factor applied to ``y`` for the clustering step only; it does not
        affect the regression. Defaults to 2.

    Returns
    -------
    numpy.ndarray
        Float array with the shape of ``y`` holding the fitted y values.

    Notes
    -----
    ``kmeans2`` is called with its default initialisation, which per the
    SciPy documentation is random, so results may vary between runs and
    some clusters may come back empty.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    yo = np.empty_like(y, dtype=float)
    # get the cluster centers and the labels for each point (only the
    # labels are needed here)
    points = np.array((x, y * y_scale)).T.astype(float)
    _, labels = cluster.vq.kmeans2(points, k)
    # for each cluster, use the labels to select the points belonging to
    # the cluster and do a linear regression
    for i in range(k):
        members = labels == i
        xs = x[members]
        ys = y[members]
        if xs.size == 0:
            # k-means left this cluster empty: there is nothing to fit and
            # no entry of yo belongs to it.
            continue
        if xs.size < 2 or xs.min() == xs.max():
            # A line is not determined by a single point or by points that
            # share one x value; fall back to the cluster's mean y.
            yo[members] = ys.mean()
            continue
        slope, interc, *_ = stats.linregress(xs, ys)
        # use the regression parameters to construct y values on the
        # best fit line
        yo[members] = xs * slope + interc
    return yo

# pylab is a legacy interface that Matplotlib discourages; pyplot provides
# the same plot/show calls.
import matplotlib.pyplot as plt

# Input samples as red dots, per-cluster fitted values as blue dots.
plt.plot(xdata, ydata, 'or')
plt.plot(xdata, split_to_lines(xdata, ydata, 4), 'ob')
plt.show()

在Python

1 个答案: