I am trying to reproduce in Python the algorithm described in the Isolation Forest paper: http://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/icdm08b.pdf?q=isolation
This is my code so far:
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA

def _h(i):
    return np.log(i) + 0.5772156649

def _c(n):
    if n > 2:
        h = _h(n-1)
        return 2*h - 2*(n - 1)/n
    if n == 2:
        return 1
    else:
        return 0

def _anomaly_score(dict_scores, n_samples):
    score = np.array([np.mean(dict_scores[k]) for k in dict_scores.keys()])
    score = -score/_c(n_samples)
    return 2**score

def _split_data(X):
    ''' split the data in the left and right nodes '''
    n_samples, n_columns = X.shape
    n_features = n_columns - 1
    feature_id = np.random.randint(low=0, high=n_features-1)
    feature = X[:, feature_id]
    split_value = np.random.choice(feature)
    left_X = X[feature <= split_value]
    right_X = X[feature > split_value]
    return left_X, right_X, feature_id, split_value

def iTree(X, add_index=False, max_depth=np.inf):
    ''' construct an isolation tree and return the number of steps required
    to isolate each element. A column of indices is added to the input matrix X
    if add_index=True. This column is required by the algorithm. '''
    n_split = {}
    def iterate(X, count=0):
        n_samples, n_columns = X.shape
        n_features = n_columns - 1
        if count > max_depth:
            for index in X[:, -1]:
                n_split[index] = count
            return
        if n_samples == 1:
            index = X[0, n_columns-1]
            n_split[index] = count
            return
        else:
            lX, rX, feature_id, split_value = _split_data(X)
            # Uncomment the print to visualize a draft of
            # the construction of the tree
            #print(lX[:,-1], rX[:,-1], feature_id, split_value, n_split)
            n_samples_lX, _ = lX.shape
            n_samples_rX, _ = rX.shape
            if n_samples_lX > 0:
                iterate(lX, count+1)
            if n_samples_rX > 0:
                iterate(rX, count+1)
    if add_index:
        n_samples, _ = X.shape
        X = np.c_[X, range(n_samples)]
    iterate(X)
    return n_split

class iForest():
    ''' Class to construct the isolation forest.
    -n_estimators: the number of trees in the forest,
    -sample_size: the size of the subsample used to build each tree,
    -add_index: adds a column of indices to the matrix X. This is required,
     and add_index can be set to False only if the last column of X already
     contains indices,
    -max_depth: the maximum depth of each tree.
    '''
    def __init__(self, n_estimators=20, sample_size=None, add_index=True,
                 max_depth=100):
        self.n_estimators = n_estimators
        self.sample_size = sample_size
        self.add_index = add_index
        self.max_depth = max_depth
        return

    def fit(self, X):
        n_samples, n_features = X.shape
        if self.sample_size is None:
            self.sample_size = int(n_samples/2)
        if self.add_index:
            X = np.c_[X, range(n_samples)]
        trees = [iTree(X[np.random.choice(n_samples,
                                          self.sample_size,
                                          replace=False)],
                       max_depth=self.max_depth)
                 for i in range(self.n_estimators)]
        self.all_anomaly_score_ = {k: None for k in range(n_samples)}
        for k in self.all_anomaly_score_.keys():
            self.all_anomaly_score_[k] = np.array([tree[k]
                                                   for tree in trees
                                                   if k in tree])
        self.anomaly_score_ = _anomaly_score(self.all_anomaly_score_, n_samples)
        return self
The main part of the code is the iTree function, which returns a dictionary containing the number of steps required to isolate each sample. A column of indices is appended to the input matrix X to make it easier to track which samples end up in each node.
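To make the output concrete, here is a small sanity check on a hypothetical toy dataset (the array and the printed result are illustrative, not from the original post; the exact depths change from run to run because the splits are random):

# Hypothetical toy run: the keys of n_split are the index-column values,
# the values are the depths at which each sample was isolated.
toy = np.array([[0.0, 0.0],
                [0.1, 0.1],
                [10.0, 10.0]])  # the last row is an obvious outlier
print(iTree(toy, add_index=True, max_depth=10))
# e.g. {2.0: 1, 0.0: 2, 1.0: 2} -- the outlier tends to isolate first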
When I compare the results obtained with my code to those obtained with the isolation forest package available for R, I get different results.
For example, consider the stackloss dataset:
data = pd.read_csv("stackloss.csv")
X = data.as_matrix()[:, 1:]
max_depth = 100
itree = iTree(X, add_index=True, max_depth=max_depth) #example of isolation tree
iforest = iForest(n_estimators=1, max_depth=max_depth, sample_size=21) # isolation forest
iforest.fit(X)
sol = np.argsort(iforest.anomaly_score_)
#correct sol = [10 5 4 8 12 9 11 17 6 19 7 14 13 15 18 3 20 16 2 1 0]
sol
This is usually different from the correct solution obtained with the R software:
https://r-forge.r-project.org/projects/iforest/
The correct solution in R was obtained with:
> tr = IsolationTrees(stackloss,ntree = 100000,hlim = 100, rFactor = 1)
> as = AnomalyScore(stackloss, tr)
> order(as$outF)
[1] 11 6 5 9 13 10 12 18 7 20 8 15 14 16 19 4 21 17 3 2 1
> order(as$outF)-1
[1] 10 5 4 8 12 9 11 17 6 19 7 14 13 15 18 3 20 16 2 1 0
>
Where is the error?
Answer 0 (score: 5)
I was finally able to solve the problem. The code is still slow because of the repeated copy operations performed at each split of the data.
Here is a working version of the algorithm:
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import pandas as pd

def _h(i):
    return np.log(i) + 0.5772156649

def _c(n):
    if n > 2:
        h = _h(n-1)
        return 2*h - 2*(n - 1)/n
    if n == 2:
        return 1
    else:
        return 0

def _anomaly_score(score, n_samples):
    score = -score/_c(n_samples)
    return 2**score

def _split_data(X):
    ''' split the data in the left and right nodes '''
    n_samples, n_columns = X.shape
    n_features = n_columns - 1
    m = M = 0
    while m == M:
        feature_id = np.random.randint(low=0, high=n_features)
        feature = X[:, feature_id]
        m = feature.min()
        M = feature.max()
        #print(m, M, feature_id, X.shape)
    split_value = np.random.uniform(m, M, 1)
    left_X = X[feature <= split_value]
    right_X = X[feature > split_value]
    return left_X, right_X, feature_id, split_value

def iTree(X, add_index=False, max_depth=np.inf):
    ''' construct an isolation tree and return the number of steps required
    to isolate each element. A column of indices is added to the input matrix X
    if add_index=True. This column is required by the algorithm. '''
    n_split = {}
    def iterate(X, count=0):
        n_samples, n_columns = X.shape
        n_features = n_columns - 1
        if count > max_depth:
            for index in X[:, -1]:
                n_split[index] = count
            return
        if n_samples == 1:
            index = X[0, n_columns-1]
            n_split[index] = count
            return
        else:
            lX, rX, feature_id, split_value = _split_data(X)
            # Uncomment the print to visualize a draft of
            # the construction of the tree
            #print(lX[:,-1], rX[:,-1], feature_id, split_value, n_split)
            n_samples_lX, _ = lX.shape
            n_samples_rX, _ = rX.shape
            if n_samples_lX > 0:
                iterate(lX, count+1)
            if n_samples_rX > 0:
                iterate(rX, count+1)
    if add_index:
        n_samples, _ = X.shape
        X = np.c_[X, range(n_samples)]
    iterate(X)
    return n_split

class iForest():
    ''' Class to construct the isolation forest.
    -n_estimators: the number of trees in the forest,
    -sample_size: the size of the subsample used to build each tree,
    -add_index: adds a column of indices to the matrix X. This is required,
     and add_index can be set to False only if the last column of X already
     contains indices,
    -max_depth: the maximum depth of each tree.
    '''
    def __init__(self, n_estimators=20, sample_size=None, add_index=True,
                 max_depth=100):
        self.n_estimators = n_estimators
        self.sample_size = sample_size
        self.add_index = add_index
        self.max_depth = max_depth
        return

    def fit(self, X):
        n_samples, n_features = X.shape
        if self.sample_size is None:
            self.sample_size = int(n_samples/2)
        if self.add_index:
            X = np.c_[X, range(n_samples)]
        trees = [iTree(X[np.random.choice(n_samples,
                                          self.sample_size,
                                          replace=False)],
                       max_depth=self.max_depth)
                 for i in range(self.n_estimators)]
        self.path_length_ = {k: None for k in range(n_samples)}
        for k in self.path_length_.keys():
            self.path_length_[k] = np.array([tree[k]
                                             for tree in trees
                                             if k in tree])
        self.path_length_ = np.array([self.path_length_[k].mean() for k in
                                      self.path_length_.keys()])
        self.anomaly_score_ = _anomaly_score(self.path_length_, self.sample_size)
        return self
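For reference, a quick end-to-end run of this fixed version (reusing the stackloss snippet from the question; the ranking is seed-dependent, and averaging over many trees is what makes it approach the R ordering quoted above):

# Exercising the fixed version on the stackloss data from the question.
data = pd.read_csv("stackloss.csv")
X = data.as_matrix()[:, 1:]
iforest = iForest(n_estimators=1000, sample_size=21, max_depth=100)
iforest.fit(X)
print(np.argsort(iforest.anomaly_score_))
# should approach [10 5 4 8 12 9 11 17 6 19 7 14 13 15 18 3 20 16 2 1 0]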
Answer 1 (score: 3)
self.anomaly_score_ = _anomaly_score(self.all_anomaly_score_, n_samples)
You are computing _anomaly_score with n_samples, the total number of samples. However, you are building the trees with subsamples, so when you compute the average search length '_c(n)' you should use sample_size instead of n_samples, because that is the size the trees were actually built on. So I believe your code should be:
self.anomaly_score_ = _anomaly_score(self.all_anomaly_score_, self.sample_size)
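To see how much this matters, here is a quick check using the question's _c with two illustrative sizes (256 and 128 are hypothetical, standing in for a full dataset and a half-size subsample):

# The normalization constant grows with the size used, so mixing up
# n_samples and sample_size rescales every anomaly score.
print(_c(256))  # ~10.24: normalization for the full dataset
print(_c(128))  # ~8.86:  normalization for the subsample the trees actually saw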
Answer 2 (score: 1)
There is a pull request for this in scikit-learn: https://github.com/scikit-learn/scikit-learn/pull/4163
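Recent scikit-learn versions ship this estimator as sklearn.ensemble.IsolationForest. A minimal usage sketch (the data and parameter values are illustrative):

# Minimal IsolationForest usage sketch on random data.
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
X = rng.randn(100, 4)              # hypothetical data
clf = IsolationForest(n_estimators=100, random_state=rng).fit(X)
scores = clf.decision_function(X)  # lower scores mean more anomalous
ranking = np.argsort(scores)       # most anomalous samples come first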
Answer 3 (score: 1)
Donbeo, your code works quite well with just a few minor adjustments. Its main problem is that you missed one of the base cases (end conditions) of the recursive algorithm, so it hangs in a loop when that condition comes up. You need something to this effect in the _split_data function (as shown in the code below), and you also need to handle this case in the iterate function (not shown):
minv = maxv = 0
inspected = set()  # this set tracks the candidate features we already inspected
while minv == maxv:
    # Check whether we have run out of features to try and none of them has
    # different values; if that is the case we need to break the loop,
    # otherwise this loops forever.
    if len(inspected) == n_features:
        # If we run out of features to try and none of them has different
        # values, return -1 to signal the caller that we can't split X anymore.
        return X, X, -1, None
    feature_id = np.random.randint(low=0, high=n_features)
    if feature_id not in inspected:
        inspected.add(feature_id)
        split_feature = X[:, feature_id]
        minv = split_feature.min()
        maxv = split_feature.max()
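The matching change in the iterate function is not shown in the answer; one plausible way to handle the new signal (my assumption, not the answer author's code) is to treat an unsplittable node as a leaf and record the current depth for all of its samples:

# Hypothetical handling inside iterate (not part of the original answer):
lX, rX, feature_id, split_value = _split_data(X)
if feature_id == -1:
    # X cannot be split any further: treat the node as a leaf and
    # record the current depth for every sample left in it.
    for index in X[:, -1]:
        n_split[index] = count
    return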