这是我现在面临的问题。我正在尝试使用seaborn.distplot()
绘制密度图(即直方图的平滑近似图),并得到下图:
上述图的问题在于,最左侧的轮廓远远超出了-1.0,而我不希望这样,因为相似性得分不能小于-1.0(即,它只能位于闭区间[-1.0, 1.0]内)。我检查了输入数组(如下所示)中是否存在小于-1.0的值,结果并没有。因此,似乎是seaborn.distplot()对分布做了平滑处理,使曲线延伸到了-1.0以下。我该如何阻止这种情况的发生?我尝试在x轴上设置xlim,但这样绘图的左侧就没有留下任何空间(而在最右侧,+1.0之后还留有一列宽度的空间)。
为便于说明,这是我用于绘图的示例输入数组和代码:
arr = np.array([-0.35416853, -0.28675528, -0.54088942, 0.18797232, 0.01707244,
-0.48090636, -0.44454523, -0.03228283, -0.70861904, 0.02323842,
-0.54905541, -0.5421915 , 0.27547336, -0.92913273, -0.55379011,
-0.23521681, -0.1079175 , -0.24065031, -0.33773661, -0.06147251,
-0.74171701, -0.74315048, 0.06634989, -0.49222919, 0.48899574,
0.13499221, 0.53120786, -0.1688146 , 0.47125832, 0.36517109,
0.33110315, 0.34495851, 0.18393 , 0.67211736, 0.11608325,
-0.92913273, -0.71209124, 0.01828323, 0.30894561, -0.06463642,
0.45423401, -0.7993457 , 0.50007295, 0.17983021, -0.66105515,
-0.92783269, -0.49277017, -0.19487059, 0.07502782, 0.00700057,
0.29958942, -0.04223299, 0.04105657, -0.12604522, 0.30506049,
-0.15600141, -0.17434894, 0.01152945, -0.11583157, 0.07010729,
-0.92913273, -0.02566766, 0.48114331, -0.13252103, -0.42600686,
0.54836633, 0.37945642, -0.34006735, -0.29560479, 0.4930249 ,
0.02693856, 0.57255816, 0.31185216, 0.19780182, 0.11909931,
-0.02853919, -0.25082142, -0.08635957, -0.28266912, -0.80937364,
-0.92913273, -0.0172393 , -0.18993503, -0.69080226, -0.66901143,
0.0470842 , -0.45307088, 0.05043218, -0.20894534, -0.22218531,
0.5189177 , -0.92913273, 0.31509469, -0.15935917, -0.92913273,
-0.41652189, 0.20265061, 0.016976 , 0.0680205 , 0.33159134,
-0.3138477 , 0.10086817, 0.37074665, -0.06916329, -0.19177307,
0.22842641, -0.15087903, 0.34376167, 0.24173604, -0.38040409,
-0.20031291, 0.17990511, 0.40231535, -0.27195479, -0.15867829,
0.2389052 , 0.08337308, -0.07327617, -0.77566734, -0.12074809,
0.19539527, 0.03727124, -0.13330546, 0.13602168, 0.36673224,
-0.3434154 , 0.19251896, 0.27692974, 0.4757158 , 0.24333386,
0.29905657, 0.57319178, 0.46753947, -0.04079389, 0.5571865 ,
0.3453707 , 0.55110949, 0.19614831, 0.61707333, 0.3680048 ,
0.48193126, 0.67330892, 0.53603774, 0.54464057, 0.35016492,
0.36970268, 0.150395 , 0.4697073 , 0.3383952 , 0.4037419 ,
-0.01055328, 0.26734498, 0.2647191 , 0.30056532, 0.46706568,
0.41460328, 0.42295413, 0.44188908, 0.29304088, -0.18437651,
-0.33404869, 0.31744862, 0.16578238, -0.2903621 , -0.36128032,
-0.65571561, 0.39868119, -0.31359498, 0.45377302, 0.23929229,
0.19958669, 0.51978988, -0.01249307, -0.16404641, 0.27193916,
-0.11159726, -0.10719093, 0.05472177, -0.64784851, 0.25594644,
-0.26109644, -0.28908332, 0.06264426, 0.05689891, 0.26437733,
-0.29424862, 0.26441642, 0.34868516, 0.00497344, -0.46811445,
-0.35795662, -0.04599685, 0.08701907, -0.32572399, 0.17639076,
0.35640737, -0.08174591, -0.13910904, 0.35387245, 0.00857055,
-0.24789401, 0.24033791, -0.08525459, 0.19189512, 0.27148848,
-0.38631975, -0.08820518, 0.12658585, 0.23404602, 0.06062359,
0.13340842, -0.11942433, -0.15974527, -0.0236961 , 0.01533685,
-0.92641117, 0.01533685, -0.00582898, 0.08251113, -0.18537655,
-0.92641117, -0.63036561, -0.02408175, -0.10033362, -0.08820518,
0.01533685, -0.1475904 , -0.06573955, -0.10033362, -0.08820518,
-0.08820518, 0.04798457, 0.29057868, 0.08310757, 0.25168328,
0.03989156, 0.1895359 , -0.44324531, -0.16724842, 0.06172038,
0.05685105, 0.3381661 , -0.46472578, -0.13137012, 0.10249921,
0.26703853, 0.14798872, 0.09729466, -0.09559039, 0.38893042,
0.6081168 , -0.32574556, -0.11493626, 0.30370567, -0.13203101,
0.12251789, 0.29993512, -0.80796771, -0.14717629, 0.37894796,
0.30086822, 0.26228619, -0.01403568, -0.46596314, -0.11860131,
-0.52649509, 0.41834337, 0.25892792, 0.40497516, -0.0287142 ,
-0.14994142, 0.41714702, 0.40928704, 0.0595943 , 0.5190621 ,
0.53760238, 0.25452441, -0.08397463, 0.22131469, -0.46173602,
0.48456617, 0.44220971, 0.16059022, 0.43723123, 0.04680989,
-0.00131657, -0.09681387, -0.48600167, -0.44205123, 0.13787778,
-0.02900436, 0.07049823, 0.02565475, -0.20544388, 0.0297263 ,
0.09162641, -0.17354248, -0.41518963, 0.12393266, -0.41754063,
-0.19018751, 0.02251257, -0.27799953, 0.21135703, 0.09597453,
0.56175636, 0.34126265, 0.17056669, 0.13149045, -0.30472518,
-0.07366951, 0.42843431, -0.22890901, 0.05518269, -0.01007775,
-0.48123104, -0.44906545, 0.09229373, -0.85684002, 0.23411821,
0.02637603, 0.02477345, 0.21678001, -0.14454807, 0.32430986,
-0.12988135, 0.07014938, 0.17991853, -0.02405694, -0.83110188,
-0.11192697, 0.02312546, -0.10770876, 0.13470276, 0.10568144,
-0.20336714, -0.15739212, 0.21271663, 0.05357167, 0.3281988 ,
0.17442453, 0.11561338, -0.68398479, -0.03704769, 0.28698584,
0.17608064, 0.30424182, 0.51034264, -0.09452418, 0.38242868,
-0.60014916, 0.21856565, -0.04819684, 0.2653766 , 0.02992649,
0.18941891, -0.04752845, 0.02295903, -0.29201727, 0.07913569,
-0.12563984, 0.21124929, -0.18801383, -0.24118712, -0.29686842,
0.27609838, -0.23855832, 0.31970457, 0.41328374, 0.19630546,
0.34077982, -0.3704136 , 0.17032295, 0.20643397, 0.34154881,
0.1504677 , 0.37392242, 0.25842101, -0.50553798, 0.35387764,
0.41873554, 0.27067669, 0.31011181, -0.51092977, -0.10282291,
-0.4126883 , -0.52383119, -0.82821877, -0.4585979 , 0.2531493 ,
0.34361492, 0.38418371, -0.22988404, 0.285816 , -0.40203361,
0.38114577, 0.15781548, 0.27335741, 0.36371593, 0.36515941])
因此,我希望图的左侧不要出现这种超出取值范围的平滑,同时在图的最右侧保留一列宽度的间距。我该如何实现?谢谢!
答案 0 :(得分:0)
获得理想结果的一种方法是使用自定义的窗口和核函数。核函数和窗口应取决于窗口中心相对于区间端点a和b的位置。
Seaborn使用scipy中的stats.gaussian_kde,或者(如果已安装)statsmodels中的KDE估计器(kde estimator)。
据我所知,gaussian_kde不允许进行此类调整。因此,我们需要实现自定义的KDE估计器。
请看下面的代码片段,它可以正常工作,可以视为进一步改进的起点。
import numpy as np
from scipy.integrate import quad
class kde:
    """Kernel density estimator confined to a closed interval ``[a, b]``.

    The smoothing window shrinks as its centre approaches either end of the
    interval (see :meth:`h`), and samples outside ``[a, b]`` get zero weight.
    :meth:`fit` must be called before :meth:`h`, :meth:`window` or ``pdf``
    are used: it sets ``rthumb`` (the rule-of-thumb bandwidth) and ``pdf``
    (the density, normalised to integrate to 1 over ``[a, b]``).
    """

    # Fallback so instances created without __init__ still use the Gaussian.
    _kernel = None

    def __init__(self, a, b, kernel=None):
        """Store the interval bounds and an optional custom kernel.

        Args:
            a: Lower bound of the support.
            b: Upper bound of the support.
            kernel: Optional callable applied to the scaled distances
                ``(x - y) / h(x)``; it receives and should return NumPy
                arrays. ``None`` selects the built-in Gaussian kernel.
        """
        self.a = a
        self.b = b
        self._kernel = kernel

    def h(self, x):
        r"""Return the window size for a window centred at scalar ``x``.

        The size equals ``rthumb`` in the middle of the interval and shrinks
        linearly to (almost) zero towards ``a`` and ``b``::

            r _     ___________        <-- r = rthumb = plateau height
              |    /           \
              |   /             \
              |--|--|---------|--|--
                 a  a+r      b-r  b

        Machine epsilon is added near the edges, and used as a floor
        everywhere, so the result is always strictly positive and can be
        divided by safely.

        Args:
            x: Scalar position of the window centre.

        Returns:
            ``min(rthumb, distance from x to the nearest edge + eps)``,
            never smaller than machine epsilon.
        """
        eps = np.finfo(float).eps
        # Distance to the nearest edge; negative when x lies outside [a, b].
        edge_distance = min(x - self.a, self.b - x)
        bandwidth = min(self.rthumb, max(edge_distance, 0.0) + eps)
        return max(bandwidth, eps)

    def kernel(self, x):
        """Evaluate the kernel at ``x``.

        Uses the callable given to the constructor when there is one,
        otherwise the standard normal density.
        """
        if self._kernel is not None:
            return self._kernel(x)
        return np.exp(-0.5 * x ** 2) / np.sqrt(2 * np.pi)

    def window(self, x):
        """Build the window function centred at scalar ``x``.

        Args:
            x: Scalar position of the window centre.

        Returns:
            A callable ``w(y)`` that maps sample position(s) ``y`` (scalar
            or array-like) to ``kernel((x - y) / h(x))``, with the weight
            forced to zero for every ``y`` outside ``[a, b]``.
        """
        def w(y):
            y = np.asarray(y, dtype=float)
            weights = self.kernel((x - y) / self.h(x))
            # The window is zero outside [a, b].
            return np.where((y > self.b) | (y < self.a), 0.0, weights)

        return w

    def fit(self, data):
        """Fit the estimator to ``data`` and expose the density as ``pdf``.

        The bandwidth is the rule of thumb ``1.06 * std * n ** (-1/5)``.
        Kernel weights are averaged without dividing by ``h(x)``; the result
        is then rescaled so that it integrates to 1 over ``[a, b]``.

        Args:
            data: Array-like of samples.

        Returns:
            ``self``, so calls can be chained (``kde(a, b).fit(d).pdf(x)``).

        Raises:
            ValueError: If ``data`` is empty, or if the unnormalised
                estimate has no positive mass on ``[a, b]`` (for example
                when every sample lies outside the interval or all samples
                are identical).
        """
        samples = np.atleast_1d(np.asarray(data, dtype=float))
        n = len(samples)
        if n == 0:
            raise ValueError("data must contain at least one sample")

        # Rule-of-thumb bandwidth.
        self.rthumb = 1.06 * np.std(samples) * np.power(n, -0.2)

        def _pdf(x):
            return self.window(x)(samples).sum() / n

        # Normalising constant: area of the raw estimate over [a, b].
        val = quad(_pdf, self.a, self.b)[0]
        if not val > 0:
            raise ValueError(
                "density estimate has no positive mass on [a, b]"
            )

        def pn(x):
            return _pdf(x) / val

        # otypes lets pdf() accept empty inputs as well.
        self.pdf = np.vectorize(pn, otypes=[float])
        return self
如果我们将其应用于您的数据:
# pyplot is imported directly; the pylab namespace is discouraged.
import matplotlib.pyplot as plt

# Fit the bounded estimator on [-1, 1] and draw the normalised density
# on a grid of 100 evenly spaced points.
k = kde(-1, 1)
x = np.linspace(-1, 1, 100)
plt.plot(x, k.fit(arr).pdf(x))
我们得到:
如果滑动窗口靠近区间的边缘,它就会被截断:
# Evaluate three windows on the grid: one near each end of the interval
# and one centred at 0.
near_upper_edge = k.window(0.9)(x)
centred = k.window(0)(x)
near_lower_edge = k.window(-.9)(x)

# The two edge windows are drawn as red dots, the centred one with the
# default line style.
plt.plot(x, near_upper_edge, 'r.', x, centred, x, near_lower_edge, 'r.')
plt.show()
请注意,此自定义类生成的是归一化的pdf估计,即AUC(kde.pdf)= 1。
已编辑:
我给h(x)的值加上了一个很小的量(浮点数的机器epsilon),现在一切都可以在没有警告的情况下正常工作。
通常,核密度估计是在理论pdf为平滑函数的假设下进行的。就你的情况而言,你可以把用scipy的gaussian_kde得到的pdf截断,最后给截断后的估计加上某个常数,使其满足AUC = 1。需要注意的是,某些分布的pdf具有第一类间断点,例如均匀分布的pdf。