In the R xgboost package, I can specify prediction=TRUE to save the out-of-fold predictions made during cross-validation, e.g.:
library(xgboost)
data(mtcars)

xgb_params = list(
  max_depth = 1,
  eta = 0.01
)

x = model.matrix(mpg ~ 0 + ., mtcars)       # feature matrix without intercept
train = xgb.DMatrix(x, label = mtcars$mpg)

res = xgb.cv(xgb_params, train, 100, prediction = TRUE, nfold = 5)
print(head(res$pred))                        # out-of-fold predictions
How do I do the equivalent in the Python package? I cannot find a prediction argument for xgboost.cv in Python.
Answer 0 (score: 8)
I'm not sure if this is what you want, but you can accomplish it by using the sklearn wrapper for xgboost. (I know I'm using the iris dataset as a regression problem, which it isn't, but this is for illustration.)
import xgboost as xgb
from sklearn.model_selection import cross_val_predict as cvp  # sklearn.cross_validation was removed in sklearn 0.20
from sklearn import datasets

X = datasets.load_iris().data[:, :2]  # first two features only
y = datasets.load_iris().target

xgb_model = xgb.XGBRegressor()
y_pred = cvp(xgb_model, X, y, cv=3, n_jobs=1)  # out-of-fold predictions
y_pred
array([ 9.07209516e-01, 1.84738374e+00, 1.78878939e+00,
1.83672094e+00, 9.07209516e-01, 9.07209516e-01,
1.77482617e+00, 9.07209516e-01, 1.75681138e+00,
1.83672094e+00, 9.07209516e-01, 1.77482617e+00,
1.84738374e+00, 1.84738374e+00, 1.12216723e+00,
9.96944368e-01, 9.07209516e-01, 9.07209516e-01,
9.96944368e-01, 9.07209516e-01, 9.07209516e-01,
9.07209516e-01, 1.77482617e+00, 8.35850239e-01,
1.77482617e+00, 9.87186074e-01, 9.07209516e-01,
9.07209516e-01, 9.07209516e-01, 1.78878939e+00,
1.83672094e+00, 9.07209516e-01, 9.07209516e-01,
8.91427517e-01, 1.83672094e+00, 9.09049034e-01,
8.91427517e-01, 1.83672094e+00, 1.84738374e+00,
9.07209516e-01, 9.07209516e-01, 1.01038718e+00,
1.78878939e+00, 9.07209516e-01, 9.07209516e-01,
1.84738374e+00, 9.07209516e-01, 1.78878939e+00,
9.07209516e-01, 8.35850239e-01, 1.99947178e+00,
1.99947178e+00, 1.99947178e+00, 1.94922602e+00,
1.99975276e+00, 1.91500926e+00, 1.99947178e+00,
1.97454870e+00, 1.99947178e+00, 1.56287444e+00,
1.96453893e+00, 1.99947178e+00, 1.99715066e+00,
1.99947178e+00, 2.84575284e-01, 1.99947178e+00,
2.84575284e-01, 2.00303388e+00, 1.99715066e+00,
2.04597521e+00, 1.99947178e+00, 1.99975276e+00,
2.00527954e+00, 1.99975276e+00, 1.99947178e+00,
1.99947178e+00, 1.99975276e+00, 1.99947178e+00,
1.99947178e+00, 1.91500926e+00, 1.95735490e+00,
1.95735490e+00, 2.00303388e+00, 1.99975276e+00,
5.92201948e-04, 1.99947178e+00, 1.99947178e+00,
1.99715066e+00, 2.84575284e-01, 1.95735490e+00,
1.89267385e+00, 1.99947178e+00, 2.00303388e+00,
1.96453893e+00, 1.98232651e+00, 2.39597082e-01,
2.39597082e-01, 1.99947178e+00, 1.97454870e+00,
1.91500926e+00, 9.99531507e-01, 1.00023842e+00,
1.00023842e+00, 1.00023842e+00, 1.00023842e+00,
1.00023842e+00, 9.22234297e-01, 1.00023842e+00,
1.00100708e+00, 1.16144836e-01, 1.00077248e+00,
1.00023842e+00, 1.00023842e+00, 1.00100708e+00,
1.00023842e+00, 1.00077248e+00, 1.00023842e+00,
1.13711983e-01, 1.00023842e+00, 1.00135887e+00,
1.00077248e+00, 1.00023842e+00, 1.00023842e+00,
1.00023842e+00, 9.99531507e-01, 1.00077248e+00,
1.00023842e+00, 1.00023842e+00, 1.00023842e+00,
1.00023842e+00, 1.00023842e+00, 1.13711983e-01,
1.00023842e+00, 1.00023842e+00, 1.00023842e+00,
1.00023842e+00, 9.78098869e-01, 1.00023842e+00,
1.00023842e+00, 1.00023842e+00, 1.00023842e+00,
1.00023842e+00, 1.00023842e+00, 1.00077248e+00,
9.99531507e-01, 1.00023842e+00, 1.00100708e+00,
1.00023842e+00, 9.78098869e-01, 1.00023842e+00], dtype=float32)
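Since iris is really a classification problem, the same wrapper route also yields out-of-fold class probabilities through cross_val_predict's method argument. A minimal sketch, assuming a recent sklearn; the return_X_y flag and the oof_proba name are additions for illustration:

import xgboost as xgb
from sklearn import datasets
from sklearn.model_selection import cross_val_predict

X, y = datasets.load_iris(return_X_y=True)
clf = xgb.XGBClassifier()

# Each row is predicted by a model that never saw it during training.
oof_proba = cross_val_predict(clf, X, y, cv=3, method='predict_proba')
print(oof_proba.shape)  # (150, 3): one probability column per class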
Answer 1 (score: 3)
This is possible with xgboost.cv(), but it is a bit clunky. It uses a callback and ... a global variable, which I'm told is undesirable.
def oof_prediction():
    """
    Dirty global-variable callback hack.
    """
    global cv_prediction_dict

    def callback(env):
        """Collect each fold's current predictions on its held-out set."""
        cv_prediction_list = []
        for cvpack in env.cvfolds:  # one pack per fold: booster (bst) + held-out DMatrix (dtest)
            cv_prediction_list.append(cvpack.bst.predict(cvpack.dtest))
        cv_prediction_dict['cv'] = cv_prediction_list

    return callback
Now we can pass the callback to xgboost.cv() as follows.
cv_prediction_dict = {}
xgb.cv(xgb_params, train, 100, nfold=5, callbacks=[oof_prediction()])
pos_oof_predictions = cv_prediction_dict.copy()
Even when early_stopping is used, it returns the fold predictions from the last iteration (num_boost_round). I believe this is something the R prediction=TRUE feature does/did not do correctly.
Hack disclaimer: I realise this is rather hacky, but it works around my poor understanding of how callbacks operate. If anyone knows how to do this properly, please comment.
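For what it's worth, the global can be avoided by closing over a caller-supplied dict. A minimal sketch of the same hack (oof_prediction_into is a hypothetical name; it relies on the same env.cvfolds internals as the callback above, so the same xgboost-version caveats apply):

def oof_prediction_into(store):
    """Like oof_prediction(), but writes into `store` instead of a global."""
    def callback(env):
        store['cv'] = [fold.bst.predict(fold.dtest) for fold in env.cvfolds]
    return callback

cv_prediction_dict = {}
xgb.cv(xgb_params, train, 100, nfold=5,
       callbacks=[oof_prediction_into(cv_prediction_dict)])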
Answer 2 (score: 0)
Here is an example using a custom callback function. The callback also saves the best models.
import os
import numpy as np

def cv_misc_callback(model_dir: str = None, oof_preds: list = None, maximize=True):
    """
    To reduce memory and disk usage, only the best models and best oof preds are stored.
    For classification, the preds are raw scores, before applying the sigmoid.
    """
    state = {}

    def init(env):
        if maximize:
            state['best_score'] = -np.inf
        else:
            state['best_score'] = np.inf
        if (model_dir is not None) and (not os.path.isdir(model_dir)):
            os.mkdir(model_dir)
        if oof_preds is not None:
            for i, _ in enumerate(env.cvfolds):
                oof_preds.append(None)

    def callback(env):
        if not state:
            init(env)
        best_score = state['best_score']
        score = env.evaluation_result_list[-1][1]  # mean of the last eval metric
        if (maximize and score > best_score) or (not maximize and score < best_score):
            for i, cvpack in enumerate(env.cvfolds):
                if model_dir is not None:
                    cvpack.bst.save_model(f'{model_dir}/{i}.model')
                if oof_preds is not None:
                    oof_preds[i] = cvpack.bst.predict(cvpack.dtest)
            state['best_score'] = score

    callback.before_iteration = False
    return callback
The CV code:
eval_res = []
oof_preds = []
history = xgb.cv(params, dtrain, num_boost_round=1000,
                 folds=folds, early_stopping_rounds=40, seed=RANDOM_SEED,
                 callbacks=[cv_misc_callback('./models', oof_preds),
                            xgb.callback.print_evaluation(period=10)])
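This snippet assumes params, dtrain, folds, and RANDOM_SEED are defined elsewhere. One plausible setup for folds, assuming sklearn's KFold and a hypothetical feature matrix X_train (xgb.cv also accepts a plain list of (train_idx, valid_idx) index pairs, which is the shape the mapping loop below expects):

from sklearn.model_selection import KFold

RANDOM_SEED = 42  # assumption: any fixed seed works
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
folds = list(kf.split(X_train))  # X_train is hypothetical here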
Mapping the list of fold preds back to oof_preds over the training data:
from numba import jit  # @jit implies numba here

@jit
def sigmoid(x):  # defined before first use
    return 1 / (1 + np.exp(-x))

oof_preds_proba = np.zeros(av_data.shape[0])  # av_data: the training data
for i, (trn_idx, val_idx) in enumerate(folds):
    oof_preds_proba[val_idx] = sigmoid(oof_preds[i])
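As a side note, scipy.special.expit computes the same logistic sigmoid, so the numba-jitted helper can be swapped out if the extra dependency is unwanted.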