我有一个类似决策树的算法。矩阵 X 中的数据在 `_split_data` 函数中被递归地分割成 2 个子集。
此操作非常昂贵,因为它需要大量时间复制数据。
我想知道是否有更好的方法来分割数据,或者是否有更合适的数据结构(例如 pandas DataFrame)?
def _split_data(X):
''' split the data in the left and right nodes '''
n_samples, n_columns = X.shape
n_features = n_columns - 1
m = M = 0
while m == M:
feature_id = np.random.randint(low=0, high=n_features)
feature = X[:, feature_id]
m = feature.min()
M = feature.max()
#print(m, M, feature_id, X.shape)
split_value = np.random.uniform(m, M, 1)
left_X = X[feature <= split_value]
right_X = X[feature > split_value]
return left_X, right_X, feature_id, split_value
def iTree(X, add_index=False, max_depth=np.inf):
    """Build an isolation tree and return the isolation depth of each row.

    The data is split recursively with ``_split_data`` until a node holds a
    single row, can no longer be split, or is deeper than ``max_depth``.

    Parameters
    ----------
    X : 2-D numpy array
        Data matrix whose last column holds a row identifier.  That column
        is required by the algorithm; pass ``add_index=True`` to have it
        appended automatically.
    add_index : bool, optional
        When True, a column ``0 .. n_samples - 1`` is appended to ``X``
        before the tree is built.
    max_depth : number, optional
        A node is split only while its depth does not exceed ``max_depth``,
        so the recorded depths can be at most ``max_depth + 1``.

    Returns
    -------
    dict
        Maps each value found in the last column of ``X`` to the number of
        splits performed before the corresponding row ended up in a leaf.
        An empty input yields an empty dict.
    """
    if add_index:
        X = np.c_[X, range(X.shape[0])]

    n_split = {}
    if X.shape[0] == 0:
        # Nothing to isolate; min()/max() would fail on an empty node.
        return n_split

    # Explicit depth-first stack instead of recursion, so a deep tree cannot
    # hit Python's recursion limit.
    pending = [(X, 0)]
    while pending:
        node, count = pending.pop()
        features = node[:, :-1]

        # A node becomes a leaf when it is too deep, holds a single row, or
        # has no feature with two distinct values left (e.g. duplicated
        # rows), in which case no split could ever separate its rows.
        is_leaf = (
            count > max_depth
            or node.shape[0] == 1
            or not np.any(features.min(axis=0) < features.max(axis=0))
        )
        if is_leaf:
            for index in node[:, -1]:
                n_split[index] = count
            continue

        lX, rX, _, _ = _split_data(node)

        # Push the right child first so the left child is processed first,
        # matching the left-then-right order of a recursive traversal.
        if rX.shape[0] > 0:
            pending.append((rX, count + 1))
        if lX.shape[0] > 0:
            pending.append((lX, count + 1))

    return n_split