Question

对于以下异常值检测函数：

# Take the 'days' column of df and drop its missing entries.
days = df['days'].dropna()
# Print every remaining value next to its index label, without truncation.
print(days.to_string())

1       350.0
2       641.0
5       389.0
6       130.0
9       344.0
16       92.0
21      392.0
24       51.0
25       28.0
28      358.0
31      309.0
34       64.0
35      380.0
36      491.0
44      332.0
46      410.0
52       66.0
54      435.0
58      156.0
59      294.0
60       75.0
63      284.0
64      105.0
68       34.0
69       50.0
75      155.0
77      427.0
78      327.0
81      116.0
87       97.0
88      274.0
89      315.0
93       99.0
95       70.0
103      62.0
106     241.0
108     397.0
110      50.0
112      41.0
115     231.0
116     238.0
117     216.0
126     105.0
140      36.0
141     192.0
144      38.0
147     122.0
150      37.0
159     236.0
163     175.0
169     138.0
179     146.0
202     125.0
208     144.0
210     166.0
221      19.0
240     155.0
242     130.0
255      54.0
264     120.0
270      65.0
271      95.0
275     158.0
280      92.0
301      65.0
313      52.0
318      91.0
329      67.0
332      38.0
333      72.0
357      36.0
393      14.0
399      74.0
402     155.0
409     503.0
411     110.0
412     338.0
428     444.0
438     408.0
439     107.0
448     214.0
449     291.0
454      91.0
455     277.0
461      96.0
462     325.0
463     154.0
465     314.0
468     377.0
470     147.0
471      48.0
482     224.0
486      75.0
490     268.0
500     135.0
502     177.0
508     133.0
509     306.0
510     187.0
515     145.0
520     353.0
521     148.0
539     182.0
545      95.0
547      82.0
548      64.0
552     143.0
557      79.0
567     168.0
582     141.0
585     224.0
598      82.0
617     202.0
635     107.0
637     169.0
639     153.0
659     156.0
660      79.0
666      49.0
679     126.0
687      44.0
694      67.0
704      64.0
708     102.0
721      74.0
807      56.0
810     102.0
814     285.0
817     386.0
826     176.0
833     106.0
838       6.0
842     322.0
844      72.0
847     192.0
848     429.0
855     101.0
856     159.0
867     168.0
872     319.0
874     178.0
880     323.0
881     295.0
886     151.0
887     286.0
889      93.0
891     336.0
901     252.0
903     111.0
904      49.0
905     113.0
915     214.0
926     230.0
960      77.0
962     192.0
964     219.0
979     166.0
981      72.0
989     143.0
999     166.0
1022    140.0
1023    191.0
1060    113.0
1061     83.0
1063     41.0
1070     28.0
1085     84.0
1105     78.0
1119     28.0
1147    202.0
1149    223.0
1157    188.0
1160    238.0
1161    212.0
1162    133.0
1164    235.0
1172    212.0
1175    243.0
1184    176.0
1195    167.0
1250     69.0
1251    108.0
1301     11.0
1306     35.0
1310     63.0
1323     38.0
1390    111.0
1391    135.0
1401    143.0
1426     70.0
1434    143.0

def outliers_iqr(ys):
    """Locate outliers in *ys* using the 1.5 * IQR rule.

    Args:
        ys: One-dimensional array-like of numbers (NumPy array, pandas
            Series, or a plain list/tuple).

    Returns:
        A pair ``(low, high)`` of ``np.where`` results. Each one is a tuple
        holding an array of zero-based positions (not values) of the entries
        strictly below the lower bound / strictly above the upper bound.
    """
    # Convert up front so the comparisons below are element-wise even for a
    # plain list, which cannot be compared against a float directly.
    values = np.asarray(ys)
    quartile_1, quartile_3 = np.percentile(values, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    return np.where(values < lower_bound), np.where(values > upper_bound)

# Run the detector on the 'days' values; the result is a (low, high) pair
# of np.where outputs, i.e. positions rather than the values themselves.
outliers = outliers_iqr(days)   
print(outliers)

我得到以下内容：

((array([], dtype=int64),), (array([ 1, 13, 74]),))

据我所知，返回了两个数组：其中一个对应数值 < 下限的元素（在我们的例子中为空），另一个对应数值 > 上限的元素。所以这里没有低于下限的异常值，但有三个异常值大于上限。

如果我执行 print("Count Outliers: " + str(len(days.where(days>upper_bound).dropna())))，我得到 491、503 和 641（其中 upper_bound 是 460）。如上所示，打印出的数组给出了 1、13、74，我猜这是异常值在 Series 中的位置。

但是，我怎样才能使用它：

# Quartile values of the data (x) against their cumulative fraction (y).
percentiles= np.array([25,50,75])
x_p = np.percentile(days, percentiles)
y_p = percentiles/100.0
# Draw the three quartile points as unconnected red diamonds.
_ = plt.plot(x_p, y_p, marker='D', color='red', linestyle='none')
ax=plt.gca() 
# NOTE(review): `outliers` is the (low, high) pair returned by outliers_iqr,
# so its length is always 2 and it holds index positions, not data values.
# The fill_betweenx calls below are the lines reported to raise ValueError.
if len(outliers)>0:
    ax.fill_betweenx(y_p[0], outliers, x_p[0], where= outliers<x_p[0], facecolor='red', alpha=0.3)
    ax.fill_betweenx(y_p[2], x_p[2], outliers, where= outliers>x_p[2], facecolor='red', alpha=0.3)

我在ax.fill_betweenx行中收到ValueError：The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()的错误，可能是由于多个“异常值”。

Answer 1

下面尝试把您的代码改写成一个可复现的示例：

import numpy as np
import matplotlib.pyplot as plt

# NOTE: defining the data as a literal list is much better than pasting a
#       printout of the variable -- readers can run the example directly
#       using this *days* variable.

days = [350.0, 641.0, 389.0, 130.0, 344.0, 92.0, 392.0, 51.0, 28.0, 358.0, 
        309.0, 64.0, 380.0, 491.0, 332.0, 410.0, 66.0, 435.0, 156.0, 294.0, 
        75.0, 284.0, 105.0, 34.0, 50.0, 155.0, 427.0, 327.0, 116.0, 97.0, 
        274.0, 315.0, 99.0, 70.0, 62.0, 241.0, 397.0, 50.0, 41.0, 231.0, 
        238.0, 216.0, 105.0, 36.0, 192.0, 38.0, 122.0, 37.0, 236.0, 175.0, 
        138.0, 146.0, 125.0, 144.0, 166.0, 19.0, 155.0, 130.0, 54.0, 120.0, 
        65.0, 95.0, 158.0, 92.0, 65.0, 52.0, 91.0, 67.0, 38.0, 72.0, 36.0, 
        14.0, 74.0, 155.0, 503.0, 110.0, 338.0, 444.0, 408.0, 107.0, 214.0, 
        291.0, 91.0, 277.0, 96.0, 325.0, 154.0, 314.0, 377.0, 147.0, 48.0, 
        224.0, 75.0, 268.0, 135.0, 177.0, 133.0, 306.0, 187.0, 145.0, 353.0, 
        148.0, 182.0, 95.0, 82.0, 64.0, 143.0, 79.0, 168.0, 141.0, 224.0, 82.0,
        202.0, 107.0, 169.0, 153.0, 156.0, 79.0, 49.0, 126.0, 44.0, 67.0, 64.0, 
        102.0, 74.0, 56.0, 102.0, 285.0, 386.0, 176.0, 106.0, 6.0, 322.0, 72.0, 
        192.0, 429.0, 101.0, 159.0, 168.0, 319.0, 178.0, 323.0, 295.0, 151.0, 
        286.0, 93.0, 336.0, 252.0, 111.0, 49.0, 113.0, 214.0, 230.0, 77.0,
        192.0, 219.0, 166.0, 72.0, 143.0, 166.0, 140.0, 191.0, 113.0, 83.0, 
        41.0, 28.0, 84.0, 78.0, 28.0, 202.0, 223.0, 188.0, 238.0, 212.0, 133.0, 
        235.0, 212.0, 243.0, 176.0, 167.0, 69.0, 108.0, 11.0, 35.0, 63.0, 38.0,
        111.0, 135.0, 143.0, 70.0, 143.0]

def get_bounds(ys):
    """Return the ``(lower, upper)`` outlier fences of *ys*.

    The fences sit 1.5 interquartile ranges below the 25th percentile and
    above the 75th percentile, as computed by ``np.percentile``.
    """
    q1, q3 = np.percentile(ys, [25, 75])
    # Distance from each quartile to its fence.
    reach = 1.5 * (q3 - q1)
    return q1 - reach, q3 + reach

def get_upper_outliers(ys):
    """Return the values of *ys* that lie at or above the upper IQR fence.

    Args:
        ys: Re-iterable sequence of numbers (it is read once to compute the
            fence and once more to filter).

    Returns:
        A list of the qualifying values, in their original order.
    """
    # The fence is derived from the argument itself rather than from any
    # module-level data, so the function works for whatever it is given.
    _, upper_bound = get_bounds(ys)
    return [y for y in ys if y >= upper_bound]

def get_lower_outliers(ys):
    """Return the values of *ys* that lie at or below the lower IQR fence.

    Args:
        ys: Re-iterable sequence of numbers (it is read once to compute the
            fence and once more to filter).

    Returns:
        A list of the qualifying values, in their original order.
    """
    # The fence is derived from the argument itself rather than from any
    # module-level data, so the function works for whatever it is given.
    lower_bound, _ = get_bounds(ys)
    return [y for y in ys if y <= lower_bound]

# Collect the actual outlier values (not their positions) on each side.
max_outliers = get_upper_outliers(days) 
min_outliers = get_lower_outliers(days) 

# Sanity check against the values reported in the question.
assert max_outliers== [641.0, 491.0, 503.0]
assert min_outliers == []

# Plot the min, quartiles and max of the data (x) against their percentile
# rank (y) as unconnected red diamonds.
percentiles = np.array([0, 25, 50, 75, 100])
x_p = np.percentile(days, percentiles)
y_p = percentiles
_ = plt.plot(x_p, y_p, marker='D', color='red', linestyle='none')
ax=plt.gca() 

# My approximation with ax.fill_betweenx does not look quite right, but it is
# close. Note that ax.fill_betweenx and ax.fill_between are different
# functions!

# Shade from x = 0 up to the largest lower outlier, if there are any.
if min_outliers: 
    ax.fill_betweenx(y_p, 0, np.max(min_outliers), facecolor='red', alpha=0.3) 

# Shade from the smallest upper outlier to the data maximum (x_p[4] is the
# 100th percentile), if there are any.
if max_outliers: 
    ax.fill_betweenx(y_p, np.min(max_outliers), x_p[4], facecolor='red', alpha=0.3)

突出显示图中的异常值

1 个答案: