排除要素中的散点

时间:2014-09-03 13:19:09

标签: python scipy pattern-matching scikit-learn pca

我有一组数据点应该放在一个基因座上并遵循一个模式,但是我想要丢弃主要基因座上的一些散点,因为我需要一个干净的基因座以便稍后应用它我的分析。蓝点更多地是我想要找到的散点,并且通过复杂的方式将它们排除在外,而不是手动完成。enter image description here

我正在考虑使用像Nearest Neighbors Regression这样的东西,但我不确定这是否是最好的方法,或者我不太熟悉它应该如何实施才能给我一个合适的结果。顺便说一下,我想在没有任何适合程序的情况下这样做。 转置版本的数据如下:

X=array([[ 0.87 , -0.01 ,  0.575,  1.212,  0.382,  0.418, -0.01 ,  0.474,
         0.432,  0.702,  0.574,  0.45 ,  0.334,  0.565,  0.414,  0.873,
         0.381,  1.103,  0.848,  0.503,  0.27 ,  0.416,  0.939,  1.211,
         1.106,  0.321,  0.709,  0.744,  0.309,  0.247,  0.47 , -0.107,
         0.925,  1.127,  0.833,  0.963,  0.385,  0.572,  0.437,  0.577,
         0.461,  0.474,  1.046,  0.892,  0.313,  1.009,  1.048,  0.349,
         1.189,  0.302,  0.278,  0.629,  0.36 ,  1.188,  0.273,  0.191,
        -0.068,  0.95 ,  1.044,  0.776,  0.726,  1.035,  0.817,  0.55 ,
         0.387,  0.476,  0.473,  0.863,  0.252,  0.664,  0.365,  0.244,
         0.238,  1.203,  0.339,  0.528,  0.326,  0.347,  0.385,  1.139,
         0.748,  0.879,  0.324,  0.265,  0.328,  0.815,  0.38 ,  0.884,
         0.571,  0.416,  0.485,  0.683,  0.496,  0.488,  1.204,  1.18 ,
         0.465,  0.34 ,  0.335,  0.447,  0.28 ,  1.02 ,  0.519,  0.335,
         1.037,  1.126,  0.323,  0.452,  0.201,  0.321,  0.285,  0.587,
         0.292,  0.228,  0.303,  0.844,  0.229,  1.077,  0.864,  0.515,
         0.071,  0.346,  0.255,  0.88 ,  0.24 ,  0.533,  0.725,  0.339,
         0.546,  0.841,  0.43 ,  0.568,  0.311,  0.401,  0.212,  0.691,
         0.565,  0.292,  0.295,  0.587,  0.545,  0.817,  0.324,  0.456,
         0.267,  0.226,  0.262,  0.338,  1.124,  0.373,  0.814,  1.241,
         0.661,  0.229,  0.416,  1.103,  0.226,  1.168,  0.616,  0.593,
         0.803,  1.124,  0.06 ,  0.573,  0.664,  0.882,  0.286,  0.139,
         1.095,  1.112,  1.167,  0.589,  0.3  ,  0.578,  0.727,  0.252,
         0.174,  0.317,  0.427,  1.184,  0.397,  0.43 ,  0.229,  0.261,
         0.632,  0.938,  0.576,  0.37 ,  0.497,  0.54 ,  0.306,  0.315,
         0.335,  0.24 ,  0.344,  0.93 ,  0.134,  0.4  ,  0.223,  1.224,
         1.187,  1.031,  0.25 ,  0.53 , -0.147,  0.087,  0.374,  0.496,
         0.441,  0.884,  0.971,  0.749,  0.432,  0.582,  0.198,  0.615,
         1.146,  0.475,  0.595,  0.304,  0.416,  0.645,  0.281,  0.576,
         1.139,  0.316,  0.892,  0.648,  0.826,  0.299,  0.381,  0.926,
         0.606],
       [-0.154, -0.392, -0.262,  0.214, -0.403, -0.363, -0.461, -0.326,
        -0.349, -0.21 , -0.286, -0.358, -0.436, -0.297, -0.394, -0.166,
        -0.389,  0.029, -0.124, -0.335, -0.419, -0.373, -0.121,  0.358,
         0.042, -0.408, -0.189, -0.213, -0.418, -0.479, -0.303, -0.645,
        -0.153,  0.098, -0.171, -0.066, -0.368, -0.273, -0.329, -0.295,
        -0.362, -0.305, -0.052, -0.171, -0.406, -0.102,  0.011, -0.375,
         0.126, -0.411, -0.42 , -0.27 , -0.407,  0.144, -0.419, -0.465,
        -0.036, -0.099,  0.007, -0.167, -0.205, -0.011, -0.151, -0.267,
        -0.368, -0.342, -0.299, -0.143, -0.42 , -0.232, -0.368, -0.417,
        -0.432,  0.171, -0.388, -0.319, -0.407, -0.379, -0.353,  0.043,
        -0.211, -0.14 , -0.373, -0.431, -0.383, -0.142, -0.345, -0.144,
        -0.302, -0.38 , -0.337, -0.2  , -0.321, -0.269,  0.406,  0.223,
        -0.322, -0.395, -0.379, -0.324, -0.424,  0.01 , -0.298, -0.386,
         0.018,  0.157, -0.384, -0.327, -0.442, -0.388, -0.387, -0.272,
        -0.397, -0.415, -0.388, -0.106, -0.504,  0.034, -0.153, -0.32 ,
        -0.271, -0.417, -0.417, -0.136, -0.447, -0.279, -0.225, -0.372,
        -0.316, -0.161, -0.331, -0.261, -0.409, -0.338, -0.437, -0.242,
        -0.328, -0.403, -0.433, -0.274, -0.331, -0.163, -0.361, -0.298,
        -0.392, -0.447, -0.429, -0.388,  0.11 , -0.348, -0.174,  0.244,
        -0.182, -0.424, -0.319,  0.088, -0.547,  0.189, -0.216, -0.228,
        -0.17 ,  0.125, -0.073, -0.266, -0.234, -0.108, -0.395, -0.395,
         0.131,  0.074,  0.514, -0.235, -0.389, -0.288, -0.22 , -0.416,
        -0.777, -0.358, -0.31 ,  0.817, -0.363, -0.328, -0.424, -0.416,
        -0.248, -0.093, -0.28 , -0.357, -0.348, -0.298, -0.384, -0.394,
        -0.362, -0.415, -0.349, -0.08 , -0.572, -0.07 , -0.423,  0.359,
         0.4  ,  0.099, -0.426, -0.252, -0.697, -0.508, -0.348, -0.254,
        -0.307, -0.116, -0.029, -0.201, -0.302, -0.25 , -0.44 , -0.233,
         0.274, -0.295, -0.223, -0.398, -0.298, -0.209, -0.389, -0.247,
         0.225, -0.395, -0.124, -0.237, -0.104, -0.361, -0.335, -0.083,
        -0.254]])

2 个答案:

答案 0 :(得分:1)

我想提供以下程序,不一定是一个完美的答案,而是作为一个初学者,你可以升级或开发基于它的类似的东西。

发生了什么事?该过程将各点分别对应于它们的x值。对于每个组(bin),计算平均y值,并且丢弃具有超过某个预定义限制的最大偏差的点。然后再次计算平均值,依此类推。如果没有更多要丢弃的点,则考虑下一个箱。您将在代码中找到评论(希望)给出更明确的解释。

以下是代码:

def discard(X):
    """
    Group points together in x-bins; discard points for every bin which deviate more than dy from the average in an iterative procedure.
    """

    dx, dy = 0.1, 0.1   # dx: bin size; dy: max. deviation

    points = sorted(zip(X[0], X[1]), key=lambda p: p[0])    # sort the points respective to x

    xx = points[0][0]   # the smallest x-value
    xmax = points[-1][0]    # the greatest x-value

    while xx < xmax:    # loop over all bins
        loop = True
        while loop:
            tmp = [p for p in points if p[0] >= xx and p[0] < xx+dx]    # all points in the current bin
            try:
                av = sum([p[1] for p in tmp]) / len(tmp)    # the average y-value
            except ZeroDivisionError:    # no points within this bin, continue with next bin
                break
            dev = sorted([p for p in tmp if abs(p[1]-av) > dy], key=lambda p: abs(p[1]-av)) # all points which deviate more than dy from the average sorted by their deviation
            try:
                points.remove(dev[-1])  # discard the point with the greatest deviation
            except IndexError:
                loop = False    # if no point is deviating more than dy continue with the next bin
        xx += dx

    return [ [p[0] for p in points], [p[1] for p in points] ]


结果显然取决于dxdy的选择。以下是一些示例(分别丢弃蓝点)。对于dx, dy = 0.1, 0.1

for dx, dy = 0.1, 0.1

正如你所看到的那样,在右尾部丢弃了许多点,因为图形具有更大的斜率(因此最好使用更大的dy)。

dx, dy = 0.10, 0.15

for dx, dy = 0.10, 0.15

由于使用了更大的dy,因此在这种情况下丢弃的点数更少。但是,确保每个bin包含足够的点非常重要,否则程序可能会失败并丢弃错误的点,如下图所示,左尾(dx, dy = 0.09, 0.15):

for dx, dy = 0.09, 0.15

那么你知道从哪个dx, dy使用?最好的解决方案可能是保持变量。例如:

选择

dx,以便在每个垃圾箱中都有一个最小点数,以避免丢弃丢弃,如上例所示。
dy可以从曲线的斜率计算,即两个相邻区间的平均值之差除以其区域中心的差值。因此,更大的斜率会导致更大的dy

此过程在某种程度上类似于最近邻居算法,但它确实例如不检查每个单独的点。以下是可能修改的空间:您可以选择n,而不是取dx最近邻居,以便[x-dx, x+dx]间隔包括n点,并应用上述程序。< / p>

使用实际的最近邻算法可能会有问题,因为您只考虑一个点沿y轴的偏差,因此x值接近参考点的点非常有利。

答案 1 :(得分:1)

我使用scipy.stats.gaussian_kde来区分基因座与散点图。我可以通过排除低密度区域中的对象来确定异常值,其标准是密度应小于mean_kde除以 20 ,例如在我的情况下。我编写了一个函数,可以在不同的二维空间中处理太多相似的特征,并且同时排除所有类似问题的所有点。我还用周围空黑色圆圈的物体说明了被排除的散点图。

import numpy as np
import seaborn as sns
from scipy import stats
import pandas as pd
import pylab as plt

def refineLocus(Colorx,Colory, ratio):
    """
    Remove objects whose density is below 1/ratio of the mean density.
    The point density is estimated by Gaussian kernel convolution,
    with automatic bandwidth determination.
    Inputs: the color pair Colorx and Colory (the x- and y-color)
            the threshold ratio for the cut
    Output: object index array
    """
    for j in range(Colorx.shape[1]):
        X=np.array([[Colorx[i,j],Colory[i,j]] for i in range(Colorx.shape[0])])
        data = pd.DataFrame(X, columns=["X", "Y"])
        kernel = stats.gaussian_kde(X.T)
        kernel.set_bandwidth(bw_method='scott')
        KdeEval=np.zeros(Colorx.shape[0],float)
        for i in range(Colorx.shape[0]):
            KdeEval[i]=kernel.evaluate(X[i,:])
        ex=np.where(KdeEval<KdeEval.mean()/ratio)
        if (j==0):
           exarr=ex[0]
        else:
           if len(ex[0])!=0:
              exarr=np.unique(np.concatenate((exarr,ex[0])))
    for j in range(Colorx.shape[1]):
        X=np.array([[Colorx[i,j],Colory[i,j]] for i in range(Colorx.shape[0])])
        data = pd.DataFrame(X, columns=["X", "Y"])
        sns.kdeplot(data.X,data.Y,bw='scott',shade=True, cmap="Purples") 
        plt.scatter(Colorx[:,j],Colory[:,j],marker='.',s=2,color='red')
        plt.scatter(Colorx[exarr,j],Colory[exarr,j],s=10,facecolors='none', edgecolors='k')
        figname = 'refineLocus.kde.%d.pdf' % (j, )
        plt.savefig(figname)
        plt.close()
    return exarr

if __name__ == "__main__":
    Cy=np.loadtxt('Colory.asc')
    Cx=np.loadtxt('Colorx.asc')
    arr=refineLocus(Cx,Cy, ratio=25)
    r=arange(Cx.shape[0])
    new_r=setxor1d(r,arr)
    nCX=Cx[new_r]
    nCY=Cy[new_r]

enter image description here enter image description here