计算以python实现的隔离林的异常分数

时间:2019-04-04 13:54:48

标签: python scikit-learn outliers anomaly-detection

我正在隔离林中工作。我实现了此代码,以确定实例的异常分数!但是我得到的分数在0.69到0.72之间!这是不合逻辑的,因为异常的得分> = 0.5,而正常实例的得分<0.5。但是根据我得到的结果,所有实例都是异常的,这是不正确的。谁能帮助我解决问题。

import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

class ExNode:
    def __init__(self,size):
        self.size=size


class InNode:
    def __init__(self,left,right,splitAtt,splitVal):
        self.left=left
        self.right=right
        self.splitAtt=splitAtt
        self.splitVal=splitVal

def iForest(X,noOfTrees,sampleSize):
    forest=[]    
    hlim=int(np.ceil(np.log2(max(sampleSize, 2))))
    for i in range(noOfTrees):
        X_train=X.sample(sampleSize)
        forest.append(iTree(X_train,0,hlim))
    return forest


def iTree(X,currHeight,hlim):
    if currHeight>=hlim or len(X)<=1:
        return ExNode(len(X))
    else:
        Q=X.columns
        q=random.choice(Q)
        p=random.choice(X[q].unique())
        X_l=X[X[q]<p]
        X_r=X[X[q]>=p]
        return InNode(iTree(X_l,currHeight+1,hlim),iTree(X_r,currHeight+1,hlim),q,p)

def pathLength(x,Tree,currHeight):
    if isinstance(Tree,ExNode):
        return currHeight
    a=Tree.splitAtt
    if x[a]<Tree.splitVal:
        return pathLength(x,Tree.left,currHeight+1)
    else:
        return pathLength(x,Tree.right,currHeight+1)

def _h(i):
    return np.log2(i) + 0.5772156649 

def _c(n):
    if n > 2:
        h = _h(n-1)
        return 2*h - (2*(n - 1)/n)
    if n == 2:
        return 1
    else:
        return 0


def _anomaly_score(score, n_samples):
    score = -score/_c(n_samples)
    return 2**score

df=pd.read_csv("db.csv")
y_true=df['Target']
df_data=df.drop('Target',1)
sampleSize=256
ifor=iForest(df_data.sample(256),100,sampleSize)
train, test = train_test_split(df_data, test_size=0.3)

for index, row in test.iterrows():    
    sxn = 0;
    testLenLst = []
    for tree in ifor:
        testLenLst.append(pathLength(row,tree,0))             
    if(len(testLenLst) != 0):
        ehx = (sum(testLenLst) / float(len(testLenLst)))  
        if(_anomaly_score(ehx,262145) >= .5):
            print("Anomaly S(x,n) " + str(_anomaly_score(ehx,sampleSize)))
        else:
            print("Normal S(x,n) " + str(_anomaly_score(ehx,sampleSize)))

0 个答案:

没有答案