Question

我有一个类似决策树的算法。矩阵 X 中的数据在 _split_data 函数中被递归地分割成 2 个子集。此操作开销很大，因为复制数据需要花费大量时间。

我想知道是否有更好的方法来分割数据，或者是否有更合适的数据结构（也许是 pandas DataFrame？）。

def _split_data(X):
    """Split the rows of ``X`` in two at a random threshold on a random feature.

    The last column of ``X`` is treated as the sample index and is never
    chosen as the split feature. A feature is usable only if its minimum and
    maximum over the rows of ``X`` differ.

    Parameters
    ----------
    X : 2-D numpy array
        Samples in rows; feature columns followed by one index column.

    Returns
    -------
    left_X, right_X : numpy arrays
        Rows whose selected feature is ``<= split_value`` and ``> split_value``
        respectively. Boolean-mask indexing is used, so both are copies.
    feature_id : int
        Column used for the split.
    split_value : numpy array of shape (1,)
        Threshold drawn with ``np.random.uniform(min, max, 1)``.

    Raises
    ------
    ValueError
        If no feature column has two distinct values, so no split exists.
    """
    n_features = X.shape[1] - 1

    # Visit the features in random order, each at most once.  The first one
    # that is not constant is kept, which is still a uniform choice among the
    # splittable features but, unlike retrying random draws forever, it stops
    # when every feature is constant (e.g. all rows are duplicates).
    for candidate in np.random.permutation(n_features):
        feature_id = int(candidate)
        feature = X[:, feature_id]
        m = feature.min()
        M = feature.max()
        if m != M:
            break
    else:
        raise ValueError(
            "cannot split: every feature column is constant over the samples")

    split_value = np.random.uniform(m, M, 1)
    left_X = X[feature <= split_value]
    right_X = X[feature > split_value]
    return left_X, right_X, feature_id, split_value


def iTree(X, add_index=False, max_depth = np.inf):
    """Build one random isolation tree and record the depth of every sample.

    The last column of the working matrix holds the sample index; it is used
    as the key of the result and is not treated as a feature.

    Parameters
    ----------
    X : 2-D numpy array
        Samples in rows. Must already end with an index column unless
        ``add_index`` is true.
    add_index : bool
        When true, a column ``0 .. n_samples-1`` is appended to ``X`` to
        serve as the index column.
    max_depth : number
        A node whose depth is greater than ``max_depth`` is not split any
        further; all its samples are recorded at that depth.

    Returns
    -------
    dict
        Maps each index value (taken from the last column) to the number of
        splits performed before the sample's node stopped being split. An
        empty input yields an empty dict.
    """
    if add_index:
        n_samples, _ = X.shape
        X = np.c_[X, range(n_samples)]

    n_split = {}

    # An explicit stack replaces recursion so that deep trees (the default
    # max_depth is unbounded) cannot exceed Python's recursion limit.
    pending = [(X, 0)] if X.shape[0] > 0 else []
    while pending:
        node, count = pending.pop()
        features = node[:, :-1]

        # A node is a leaf when it is too deep, holds a single sample, or
        # holds only identical samples.  The last case cannot be split at all
        # and must not be handed to _split_data.
        is_leaf = (
            count > max_depth
            or node.shape[0] == 1
            or not (features != features[0]).any()
        )
        if is_leaf:
            for index in node[:, -1]:
                n_split[index] = count
            continue

        lX, rX, _, _ = _split_data(node)
        # Push the right child first so the left child is processed first,
        # matching a left-to-right depth-first traversal.
        if rX.shape[0] > 0:
            pending.append((rX, count + 1))
        if lX.shape[0] > 0:
            pending.append((lX, count + 1))

    return n_split

如何在类似决策树的算法中快速拆分数据

0 个答案: