从python中的xgboost.cv获得无法预测的结果

时间:2016-10-28 14:46:57

标签: python cross-validation xgboost

在R xgboost包中,我可以指定predictions=TRUE来保存交叉验证期间的折叠预测,例如:

# Load xgboost; mtcars ships with base R's datasets package.
library(xgboost)
data(mtcars)
# Booster hyper-parameters: depth-1 trees (stumps) with a small learning rate.
xgb_params = list(
  max_depth = 1,
  eta = 0.01
)
# Design matrix of all predictors without an intercept, wrapped in a DMatrix
# with mpg as the regression label.
x = model.matrix(mpg~0+., mtcars)
train = xgb.DMatrix(x, label=mtcars$mpg)
# 100 boosting rounds of 5-fold CV; prediction=TRUE keeps the
# out-of-fold predictions in res$pred.
res = xgb.cv(xgb_params, train, 100, prediction=TRUE, nfold=5)
print(head(res$pred))

我如何在python包中执行等效操作?我在python包的xgboost.cv中找不到类似的prediction参数。

3 个答案:

答案 0 :(得分:8)

我不确定这是否是你想要的,但是你可以通过使用xgboost的sklearn包装器来实现这一点。(我知道iris数据集本来是分类问题而不是回归问题——这里只是用来演示。)

# Use xgboost's sklearn wrapper so sklearn's cross_val_predict returns the
# out-of-fold predictions directly.
import xgboost as xgb
# FIX: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# cross_val_predict now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_predict as cvp
from sklearn import datasets

X = datasets.load_iris().data[:, :2]
y = datasets.load_iris().target
xgb_model = xgb.XGBRegressor()
# One held-out (out-of-fold) prediction per sample, from 3-fold CV.
y_pred = cvp(xgb_model, X, y, cv=3, n_jobs=1)
y_pred


array([  9.07209516e-01,   1.84738374e+00,   1.78878939e+00,
         1.83672094e+00,   9.07209516e-01,   9.07209516e-01,
         1.77482617e+00,   9.07209516e-01,   1.75681138e+00,
         1.83672094e+00,   9.07209516e-01,   1.77482617e+00,
         1.84738374e+00,   1.84738374e+00,   1.12216723e+00,
         9.96944368e-01,   9.07209516e-01,   9.07209516e-01,
         9.96944368e-01,   9.07209516e-01,   9.07209516e-01,
         9.07209516e-01,   1.77482617e+00,   8.35850239e-01,
         1.77482617e+00,   9.87186074e-01,   9.07209516e-01,
         9.07209516e-01,   9.07209516e-01,   1.78878939e+00,
         1.83672094e+00,   9.07209516e-01,   9.07209516e-01,
         8.91427517e-01,   1.83672094e+00,   9.09049034e-01,
         8.91427517e-01,   1.83672094e+00,   1.84738374e+00,
         9.07209516e-01,   9.07209516e-01,   1.01038718e+00,
         1.78878939e+00,   9.07209516e-01,   9.07209516e-01,
         1.84738374e+00,   9.07209516e-01,   1.78878939e+00,
         9.07209516e-01,   8.35850239e-01,   1.99947178e+00,
         1.99947178e+00,   1.99947178e+00,   1.94922602e+00,
         1.99975276e+00,   1.91500926e+00,   1.99947178e+00,
         1.97454870e+00,   1.99947178e+00,   1.56287444e+00,
         1.96453893e+00,   1.99947178e+00,   1.99715066e+00,
         1.99947178e+00,   2.84575284e-01,   1.99947178e+00,
         2.84575284e-01,   2.00303388e+00,   1.99715066e+00,
         2.04597521e+00,   1.99947178e+00,   1.99975276e+00,
         2.00527954e+00,   1.99975276e+00,   1.99947178e+00,
         1.99947178e+00,   1.99975276e+00,   1.99947178e+00,
         1.99947178e+00,   1.91500926e+00,   1.95735490e+00,
         1.95735490e+00,   2.00303388e+00,   1.99975276e+00,
         5.92201948e-04,   1.99947178e+00,   1.99947178e+00,
         1.99715066e+00,   2.84575284e-01,   1.95735490e+00,
         1.89267385e+00,   1.99947178e+00,   2.00303388e+00,
         1.96453893e+00,   1.98232651e+00,   2.39597082e-01,
         2.39597082e-01,   1.99947178e+00,   1.97454870e+00,
         1.91500926e+00,   9.99531507e-01,   1.00023842e+00,
         1.00023842e+00,   1.00023842e+00,   1.00023842e+00,
         1.00023842e+00,   9.22234297e-01,   1.00023842e+00,
         1.00100708e+00,   1.16144836e-01,   1.00077248e+00,
         1.00023842e+00,   1.00023842e+00,   1.00100708e+00,
         1.00023842e+00,   1.00077248e+00,   1.00023842e+00,
         1.13711983e-01,   1.00023842e+00,   1.00135887e+00,
         1.00077248e+00,   1.00023842e+00,   1.00023842e+00,
         1.00023842e+00,   9.99531507e-01,   1.00077248e+00,
         1.00023842e+00,   1.00023842e+00,   1.00023842e+00,
         1.00023842e+00,   1.00023842e+00,   1.13711983e-01,
         1.00023842e+00,   1.00023842e+00,   1.00023842e+00,
         1.00023842e+00,   9.78098869e-01,   1.00023842e+00,
         1.00023842e+00,   1.00023842e+00,   1.00023842e+00,
         1.00023842e+00,   1.00023842e+00,   1.00077248e+00,
         9.99531507e-01,   1.00023842e+00,   1.00100708e+00,
         1.00023842e+00,   9.78098869e-01,   1.00023842e+00], dtype=float32)

答案 1 :(得分:3)

这可以通过xgboost.cv()实现,但有点笨拙。它使用回调和...一个全局变量,我被告知是不可取的。

def oof_prediction():
    """
    Build an xgboost.cv callback that captures out-of-fold predictions.

    Dirty global-variable hack: on every boosting round the callback
    overwrites ``cv_prediction_dict['cv']`` (a module-level dict that the
    caller must create) with a list holding each fold's predictions on its
    held-out data, so after xgb.cv() returns the dict contains the
    predictions of the last iteration.

    Returns:
        callable: the callback to pass in xgb.cv(..., callbacks=[...]).
    """

    global cv_prediction_dict

    def callback(env):
        """Collect each fold's predictions on its held-out DMatrix."""
        # FIX: iterate the actual folds instead of hard-coding indices
        # [0, 1, 2, 3, 4], so the callback works for any nfold value.
        cv_prediction_list = []
        for cvpack in env.cvfolds:
            cv_prediction_list.append([cvpack.bst.predict(cvpack.dtest)])

        cv_prediction_dict['cv'] = cv_prediction_list

    return callback

现在我们可以从xgboost.cv()调用回调,如下所示。

# The callback writes into this module-level dict (see oof_prediction above).
cv_prediction_dict = {}
# FIX: the original line had a stray ')' after callbacks=[...], which made it
# a SyntaxError; nfold is a keyword argument of xgb.cv itself.
xgb.cv(xgb_params, train, 100, callbacks=[oof_prediction()], nfold=5)
# Copy so later cv runs do not clobber these fold predictions.
pos_oof_predictions = cv_prediction_dict.copy()

即使使用early_stopping,它也会返回最后一次迭代/num_boost_round的折叠预测。我认为这正是R的predictions=TRUE功能无法(或者说过去无法)正确做到的地方。

Hack免责声明:我知道这是相当hacky,但这是一个解决我对回调如何工作的糟糕理解的工作。如果有人知道如何做到这一点,请发表评论。

答案 2 :(得分:0)

以下是使用自定义callback函数的示例。此功能还可以保存最佳模型。

import os

# FIX: the original snippet used np.inf without importing numpy.
import numpy as np


def cv_misc_callback(model_dir: str = None, oof_preds: list = None, maximize=True):
    """
    Build an xgboost.cv callback that keeps only the best fold models and
    the best out-of-fold predictions (to reduce memory and disk storage).

    For classification, the preds are raw scores before applying sigmoid.

    Args:
        model_dir: directory where one model per fold is saved as
            '{i}.model' whenever the CV score improves; None disables saving.
        oof_preds: list the caller owns; it is padded with one None slot per
            fold on the first round, then slot i is overwritten with fold i's
            predictions each time the score improves. None disables this.
        maximize: True if a larger evaluation score is better.

    Returns:
        callable: the callback to pass in xgb.cv(..., callbacks=[...]).
    """
    state = {}

    def init(env):
        # Lazy one-time initialisation on the first boosting round.
        state['best_score'] = -np.inf if maximize else np.inf
        if (model_dir is not None) and (not os.path.isdir(model_dir)):
            # FIX: makedirs(..., exist_ok=True) also creates missing parent
            # directories and is race-free, unlike os.mkdir().
            os.makedirs(model_dir, exist_ok=True)

        if oof_preds is not None:
            # One slot per fold, filled in as scores improve.
            oof_preds.extend([None] * len(env.cvfolds))

    def callback(env):
        if not state:
            init(env)
        best_score = state['best_score']
        # The last entry of evaluation_result_list is the metric xgboost
        # watches for early stopping: (name, value, ...).
        score = env.evaluation_result_list[-1][1]
        if (maximize and score > best_score) or (not maximize and score < best_score):
            for i, cvpack in enumerate(env.cvfolds):
                if model_dir is not None:
                    cvpack.bst.save_model(f'{model_dir}/{i}.model')
                if oof_preds is not None:
                    oof_preds[i] = cvpack.bst.predict(cvpack.dtest)
            state['best_score'] = score

    # Run after each boosting iteration, not before it.
    callback.before_iteration = False
    return callback

简历代码:

# Containers filled in by cv_misc_callback during cross-validation.
eval_res = []
oof_preds = []
# NOTE(review): `params`, `dtrain`, `folds` and `RANDOM_SEED` are assumed to
# be defined earlier in the caller's script — they are not shown here.
# cv_misc_callback saves the best per-fold models under ./models and the best
# out-of-fold predictions into oof_preds; print_evaluation logs every 10 rounds.
history = xgb.cv(params, dtrain, num_boost_round=1000,
                 folds=folds, early_stopping_rounds=40, seed=RANDOM_SEED,
                 callbacks=[cv_misc_callback('./models', oof_preds), xgb.callback.print_evaluation(period=10)])

将preds列表映射到train_data的oof_preds

def sigmoid(x):
    """Map raw xgboost margin scores to probabilities in (0, 1)."""
    # NOTE: the original decorated this with numba's @jit, but numba was
    # never imported in the snippet, which raises NameError at definition
    # time; plain numpy is already vectorised, so the decorator is dropped.
    return 1 / (1 + np.exp(-x))


# FIX: sigmoid() must be defined *before* this loop runs — the original
# snippet called it first and defined it afterwards, a NameError in a
# top-to-bottom script.
# NOTE(review): `av_data`, `folds` and `oof_preds` come from earlier code
# (the cv run above); presumably av_data is the training frame — verify.
oof_preds_proba = np.zeros(av_data.shape[0])
for i, (trn_idx, val_idx) in enumerate(folds):
    # Scatter each fold's held-out predictions back to the original row order.
    oof_preds_proba[val_idx] = sigmoid(oof_preds[i])