对于以下异常值检测功能:
# Take the 'days' column with missing values dropped and print every entry.
# NOTE(review): `df` is defined outside this snippet; presumably a pandas DataFrame.
days = df['days'].dropna()
print(days.to_string())
1 350.0
2 641.0
5 389.0
6 130.0
9 344.0
16 92.0
21 392.0
24 51.0
25 28.0
28 358.0
31 309.0
34 64.0
35 380.0
36 491.0
44 332.0
46 410.0
52 66.0
54 435.0
58 156.0
59 294.0
60 75.0
63 284.0
64 105.0
68 34.0
69 50.0
75 155.0
77 427.0
78 327.0
81 116.0
87 97.0
88 274.0
89 315.0
93 99.0
95 70.0
103 62.0
106 241.0
108 397.0
110 50.0
112 41.0
115 231.0
116 238.0
117 216.0
126 105.0
140 36.0
141 192.0
144 38.0
147 122.0
150 37.0
159 236.0
163 175.0
169 138.0
179 146.0
202 125.0
208 144.0
210 166.0
221 19.0
240 155.0
242 130.0
255 54.0
264 120.0
270 65.0
271 95.0
275 158.0
280 92.0
301 65.0
313 52.0
318 91.0
329 67.0
332 38.0
333 72.0
357 36.0
393 14.0
399 74.0
402 155.0
409 503.0
411 110.0
412 338.0
428 444.0
438 408.0
439 107.0
448 214.0
449 291.0
454 91.0
455 277.0
461 96.0
462 325.0
463 154.0
465 314.0
468 377.0
470 147.0
471 48.0
482 224.0
486 75.0
490 268.0
500 135.0
502 177.0
508 133.0
509 306.0
510 187.0
515 145.0
520 353.0
521 148.0
539 182.0
545 95.0
547 82.0
548 64.0
552 143.0
557 79.0
567 168.0
582 141.0
585 224.0
598 82.0
617 202.0
635 107.0
637 169.0
639 153.0
659 156.0
660 79.0
666 49.0
679 126.0
687 44.0
694 67.0
704 64.0
708 102.0
721 74.0
807 56.0
810 102.0
814 285.0
817 386.0
826 176.0
833 106.0
838 6.0
842 322.0
844 72.0
847 192.0
848 429.0
855 101.0
856 159.0
867 168.0
872 319.0
874 178.0
880 323.0
881 295.0
886 151.0
887 286.0
889 93.0
891 336.0
901 252.0
903 111.0
904 49.0
905 113.0
915 214.0
926 230.0
960 77.0
962 192.0
964 219.0
979 166.0
981 72.0
989 143.0
999 166.0
1022 140.0
1023 191.0
1060 113.0
1061 83.0
1063 41.0
1070 28.0
1085 84.0
1105 78.0
1119 28.0
1147 202.0
1149 223.0
1157 188.0
1160 238.0
1161 212.0
1162 133.0
1164 235.0
1172 212.0
1175 243.0
1184 176.0
1195 167.0
1250 69.0
1251 108.0
1301 11.0
1306 35.0
1310 63.0
1323 38.0
1390 111.0
1391 135.0
1401 143.0
1426 70.0
1434 143.0
def outliers_iqr(ys):
    """Locate outliers in *ys* using the 1.5 * IQR rule.

    Args:
        ys: One-dimensional numeric data. It is coerced with ``np.asarray``
            so plain lists work as well as arrays.

    Returns:
        A pair ``(below, above)``. Each element is the tuple returned by
        ``np.where``: ``below[0]`` holds the positions of values strictly
        less than ``Q1 - 1.5 * IQR`` and ``above[0]`` the positions of values
        strictly greater than ``Q3 + 1.5 * IQR``.
    """
    # A plain list does not support `list < float`, so compare on an array.
    values = np.asarray(ys)
    quartile_1, quartile_3 = np.percentile(values, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    return np.where(values < lower_bound), np.where(values > upper_bound)
# Run the IQR check on `days`. The result is a pair of np.where outputs:
# first the positions below the lower bound, then those above the upper bound.
outliers = outliers_iqr(days)
print(outliers)
我得到以下内容:
((array([], dtype=int64),), (array([ 1, 13, 74]),))
据我所知,函数返回了两个数组:一个是小于下限的值(在我们的例子中为空),另一个是大于上限的值。所以这里没有低于下限的异常值,但有三个异常值大于上限。
如果我执行 print("Count Outliers: " + str(len(days.where(days>upper_bound).dropna())))
我得到491、503和641(其中upper_bound是460)。如上所示,打印出的数组给出了1、13、74,我假设它们是异常值在序列中的位置。
但是,我怎样才能使用它:
# x_p holds the 25th/50th/75th percentile values of `days`;
# y_p holds the matching fractions (0.25, 0.5, 0.75).
percentiles= np.array([25,50,75])
x_p = np.percentile(days, percentiles)
y_p = percentiles/100.0
# Draw the three quartile points as unconnected red diamonds.
_ = plt.plot(x_p, y_p, marker='D', color='red', linestyle='none')
ax=plt.gca()
# NOTE(review): `outliers` is presumably the 2-tuple returned by
# outliers_iqr(days); if so, len(outliers) is always 2 and this check
# never skips the fill calls - confirm.
if len(outliers)>0:
# NOTE(review): both calls pass a single element of y_p (a scalar) and the
# whole `outliers` object rather than arrays of coordinates; comparing
# `outliers` with x_p[...] is the likely source of the reported
# ValueError - TODO confirm.
ax.fill_betweenx(y_p[0], outliers, x_p[0], where= outliers<x_p[0], facecolor='red', alpha=0.3)
ax.fill_betweenx(y_p[2], x_p[2], outliers, where= outliers>x_p[2], facecolor='red', alpha=0.3)
我在ax.fill_betweenx行中收到错误 ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
这可能是由于存在多个“异常值”。
答案 0 :(得分:1)
下面尝试把您的代码改写成一个可重现的示例:
import numpy as np
import matplotlib.pyplot as plt
# NOTE: a literal list is far more useful than a printout of the variable:
# it can be pasted and run directly as *days* to reproduce the example.
days = [350.0, 641.0, 389.0, 130.0, 344.0, 92.0, 392.0, 51.0, 28.0, 358.0,
309.0, 64.0, 380.0, 491.0, 332.0, 410.0, 66.0, 435.0, 156.0, 294.0,
75.0, 284.0, 105.0, 34.0, 50.0, 155.0, 427.0, 327.0, 116.0, 97.0,
274.0, 315.0, 99.0, 70.0, 62.0, 241.0, 397.0, 50.0, 41.0, 231.0,
238.0, 216.0, 105.0, 36.0, 192.0, 38.0, 122.0, 37.0, 236.0, 175.0,
138.0, 146.0, 125.0, 144.0, 166.0, 19.0, 155.0, 130.0, 54.0, 120.0,
65.0, 95.0, 158.0, 92.0, 65.0, 52.0, 91.0, 67.0, 38.0, 72.0, 36.0,
14.0, 74.0, 155.0, 503.0, 110.0, 338.0, 444.0, 408.0, 107.0, 214.0,
291.0, 91.0, 277.0, 96.0, 325.0, 154.0, 314.0, 377.0, 147.0, 48.0,
224.0, 75.0, 268.0, 135.0, 177.0, 133.0, 306.0, 187.0, 145.0, 353.0,
148.0, 182.0, 95.0, 82.0, 64.0, 143.0, 79.0, 168.0, 141.0, 224.0, 82.0,
202.0, 107.0, 169.0, 153.0, 156.0, 79.0, 49.0, 126.0, 44.0, 67.0, 64.0,
102.0, 74.0, 56.0, 102.0, 285.0, 386.0, 176.0, 106.0, 6.0, 322.0, 72.0,
192.0, 429.0, 101.0, 159.0, 168.0, 319.0, 178.0, 323.0, 295.0, 151.0,
286.0, 93.0, 336.0, 252.0, 111.0, 49.0, 113.0, 214.0, 230.0, 77.0,
192.0, 219.0, 166.0, 72.0, 143.0, 166.0, 140.0, 191.0, 113.0, 83.0,
41.0, 28.0, 84.0, 78.0, 28.0, 202.0, 223.0, 188.0, 238.0, 212.0, 133.0,
235.0, 212.0, 243.0, 176.0, 167.0, 69.0, 108.0, 11.0, 35.0, 63.0, 38.0,
111.0, 135.0, 143.0, 70.0, 143.0]
def get_bounds(ys, factor=1.5):
    """Return the lower and upper outlier fences for *ys*.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Args:
        ys: One-dimensional numeric data (list or array).
        factor: Multiple of the IQR added beyond each quartile. The default
            of 1.5 reproduces the original hard-coded behaviour.

    Returns:
        A ``(lower_bound, upper_bound)`` tuple.
    """
    quartile_1, quartile_3 = np.percentile(ys, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * factor)
    upper_bound = quartile_3 + (iqr * factor)
    return lower_bound, upper_bound
def get_upper_outliers(ys):
    """Return the values in *ys* that lie above the upper IQR fence.

    Args:
        ys: Iterable of numeric values.

    Returns:
        A list, in input order, of the values strictly greater than the
        upper bound reported by ``get_bounds(ys)``.
    """
    # Derive the fence from the argument itself, not from a module-level
    # variable, so the function works for any dataset.
    _lower_bound, upper_bound = get_bounds(ys)
    # Strict comparison: with constant data the IQR is 0 and the fence equals
    # the data value, which must not be reported as an outlier.
    return [y for y in ys if y > upper_bound]
def get_lower_outliers(ys):
    """Return the values in *ys* that lie below the lower IQR fence.

    Args:
        ys: Iterable of numeric values.

    Returns:
        A list, in input order, of the values strictly less than the
        lower bound reported by ``get_bounds(ys)``.
    """
    # Derive the fence from the argument itself, not from a module-level
    # variable, so the function works for any dataset.
    lower_bound, _upper_bound = get_bounds(ys)
    # Strict comparison: with constant data the IQR is 0 and the fence equals
    # the data value, which must not be reported as an outlier.
    return [y for y in ys if y < lower_bound]
max_outliers = get_upper_outliers(days)
min_outliers = get_lower_outliers(days)
# Sanity checks against the sample data: three high outliers, no low ones.
assert max_outliers == [641.0, 491.0, 503.0]
assert min_outliers == []

# Plot the min, quartiles and max of the data against their percentile rank.
percentiles = np.array([0, 25, 50, 75, 100])
x_p = np.percentile(days, percentiles)
y_p = percentiles
_ = plt.plot(x_p, y_p, marker='D', color='red', linestyle='none')
ax = plt.gca()

# This use of ax.fill_betweenx is only an approximation of the intended
# picture, but it is close.
# Note: ax.fill_betweenx and ax.fill_between are different functions!
if min_outliers:
    # Shade from x = 0 up to the largest low outlier.
    ax.fill_betweenx(y_p, 0, np.max(min_outliers), facecolor='red', alpha=0.3)
if max_outliers:
    # Shade from the smallest high outlier up to the data maximum
    # (x_p[-1] is the 100th percentile).
    ax.fill_betweenx(y_p, np.min(max_outliers), x_p[-1], facecolor='red', alpha=0.3)