如何在决策树类中使用分类变量?

时间:2019-11-15 19:58:02

标签: class decision-tree

我尝试编写一个简单的决策树类,但我不断遇到这个错误:fit() missing 1 required positional argument: 'cat_features'(fit() 缺少 1 个必需的位置参数:'cat_features')。即使我在调用 fit 时传入了 cat_features 数组,错误依然出现。而且我怀疑代码里还有其他错误,但我找不到它们在哪里。我该如何解决这个问题?

class DecisionTree:
    """A simple decision-tree classifier with optional categorical splits.

    The original header was mangled by machine translation
    ("类DecisionTree: def init (自身,条件)"): the `class` keyword, the
    `__init__` dunder name and the `(self, criterion)` signature were all
    translated away. Reconstructed here.
    """

    def __init__(self, criterion):
        # Name of the impurity criterion; used as a key into the
        # module-level `criterion_dict` (defined elsewhere in the file).
        self.criterion = criterion
        # Records the (feature_number, split_value) chosen at each step of fit().
        self.feature_numbers_and_splits = []


def criterion_count(self, node_y):
    """Evaluate the configured impurity criterion on one node's labels.

    `criterion_dict` (defined elsewhere in the file) maps the criterion
    name stored in ``self.criterion`` to its implementation.
    """
    criterion_fn = criterion_dict[self.criterion]
    return criterion_fn(node_y)

def _find_best_split(self, node_X, node_y, cat_feature=False):
    """Return the (feature_number, split_value) with the lowest weighted child impurity.

    Parameters
    ----------
    node_X : pandas DataFrame of the node's feature rows.
    node_y : sequence of labels aligned with node_X's rows.
    cat_feature : if True, split by equality (categorical feature);
        otherwise split by a numeric threshold.

    Returns (None, None) when no non-degenerate split exists.
    """
    best_feature_number, best_split_value = None, None
    # BUG FIX: impurity criteria (gini/entropy) must be MINIMISED.  The
    # original started best_score at -1 and kept the LARGEST weighted
    # impurity, i.e. it systematically picked the worst split.
    best_score = float("inf")

    for feature_number in range(node_X.shape[1]):
        unique_items = np.unique(node_X.iloc[:, feature_number])
        for split_value in unique_items:
            if not cat_feature:
                # Numeric split: strictly-greater goes left, the rest right.
                node_y_left = [y for j, y in enumerate(node_y) if node_X.iloc[j, feature_number] > split_value]
                node_y_right = [y for j, y in enumerate(node_y) if node_X.iloc[j, feature_number] <= split_value]
            else:
                # Categorical split: equal-to goes left, the rest right.
                node_y_left = [y for j, y in enumerate(node_y) if node_X.iloc[j, feature_number] == split_value]
                node_y_right = [y for j, y in enumerate(node_y) if node_X.iloc[j, feature_number] != split_value]

            # BUG FIX: skip degenerate splits that leave one child empty —
            # they make no progress and would evaluate the criterion on an
            # empty label list.
            if not node_y_left or not node_y_right:
                continue

            criterion_left = self.criterion_count(node_y_left)
            criterion_right = self.criterion_count(node_y_right)

            # Weighted average of the children's impurities.
            score = (criterion_left * len(node_y_left) + criterion_right * len(node_y_right)) / len(node_y)

            if score < best_score:
                best_score = score
                best_feature_number = feature_number
                best_split_value = split_value

    return best_feature_number, best_split_value


def _split_leaf(self, node_X, node_y, feature_number, split_value):
    """Partition one node's samples on a numeric feature at split_value.

    Rows whose feature value is strictly below split_value go left;
    rows at or above it go right.  Returns the tuple
    (left_X, left_y, right_X, right_y).
    """
    feature_column = node_X.iloc[:, feature_number]
    goes_left = feature_column < split_value
    goes_right = feature_column >= split_value

    left_node_X = node_X.loc[goes_left]
    left_node_y = node_y.loc[goes_left]
    right_node_X = node_X.loc[goes_right]
    right_node_y = node_y.loc[goes_right]

    return left_node_X, left_node_y, right_node_X, right_node_y


def fit(self, X, y, cat_features, verbose=False, recursion_depth=0):
    """Recursively grow the tree, recording each chosen split.

    Parameters
    ----------
    X : pandas DataFrame of feature rows for this node.
    y : labels aligned with X's rows.
    cat_features : the caller-supplied categorical-feature spec.
        NOTE(review): it is accepted and forwarded but not yet consulted —
        wiring it into _find_best_split's `cat_feature` flag is still a TODO.
    verbose : when True, print the split bookkeeping after each split.
    recursion_depth : current depth, 0 at the root.
    """
    best_feature_number, best_split_value = self._find_best_split(X, y)

    if recursion_depth == 0:
        self.feature_numbers_and_splits.append((best_feature_number, best_split_value))
    else:
        self.feature_numbers_and_splits[recursion_depth].append((best_feature_number, best_split_value))

    left_node_X, left_node_y, right_node_X, right_node_y = \
                self._split_leaf(X, y, best_feature_number, best_split_value)

    # Stop when the split no longer separates the samples (pure/degenerate leaf).
    if len(left_node_X) == 0 or len(right_node_X) == 0:
        return

    self.feature_numbers_and_splits.append([])
    if verbose:
        # Was an unconditional print; the unused `verbose` flag now gates it.
        print(self.feature_numbers_and_splits)

    # BUG FIX (the reported TypeError "fit() missing 1 required positional
    # argument: 'cat_features'"): the recursive calls omitted cat_features
    # and never advanced recursion_depth.  Both are forwarded now.
    # NOTE(review): the feature_numbers_and_splits indexing scheme is still
    # fragile for trees deeper than two levels — verify the intended layout.
    self.fit(left_node_X, left_node_y, cat_features,
             verbose=verbose, recursion_depth=recursion_depth + 1)
    self.fit(right_node_X, right_node_y, cat_features,
             verbose=verbose, recursion_depth=recursion_depth + 1)


def predict(self, X, cat_features, id_=0, feature_number=None, split_value=None):
    """Predict labels for X by recursively descending the recorded splits.

    NOTE(review): this method is broken as written and needs a redesign:
      * There is NO base case — every call unconditionally recurses on both
        halves, so it will exhaust feature_numbers_and_splits (IndexError)
        or hit the recursion limit before ever returning a prediction.
      * cat_features is accepted but never used.
    """

    # Root call: seed the walk with the first recorded split.
    if feature_number is None:
        feature_number, split_value = self.feature_numbers_and_splits[0]

    # Positional (iloc) column access here...
    X_left = X[X.iloc[:,feature_number]<split_value]
    X_right = X[X.iloc[:,feature_number]>=split_value]



    # Recurse into the two children recorded at entry id_+1.
    # NOTE(review): assumes feature_numbers_and_splits[id_+1] is a list of
    # exactly two (feature, split) pairs — confirm against fit()'s bookkeeping.
    y_pred_left = self.predict(X_left, cat_features, 
                               feature_number=self.feature_numbers_and_splits[id_+1][0][0],
                               split_value=self.feature_numbers_and_splits[id_+1][0][1])
    y_pred_right = self.predict(X_right, cat_features, 
                                feature_number=self.feature_numbers_and_splits[id_+1][1][0],
                                split_value=self.feature_numbers_and_splits[id_+1][1][1])

    # NOTE(review): ...but LABEL-based access (X[feature_number]) here — the
    # two disagree whenever column labels are not 0..n-1.  Also, these are
    # pandas chained-indexing assignments: they write into a temporary copy,
    # so the 'y' column on X is likely never actually set
    # (SettingWithCopyWarning); use a single .loc[mask, 'y'] = ... instead.
    X[X[feature_number]<split_value]['y'] = y_pred_left
    X[X[feature_number]>=split_value]['y'] = y_pred_right
    y_pred = X[['y']]

    return y_pred

0 个答案:

没有答案
相关问题