xgboost.cv和sklearn.cross_val_score之间的差异

时间:2018-12-28 16:36:23

标签: python scikit-learn cross-validation xgboost

注意:我已经看过并尝试了之前相关问题中给出的方法,但并没有解决我的问题。

我想了解为什么 xgboost.cv 和 sklearn.cross_val_score 返回的结果会(略)有不同。为此,我做了一个小实验:在两种实现(原生 Python xgboost 接口和 sklearn 接口)中设置相同的参数,期望得到相同的结果……但问题是我做不到。

导入库和定义常量

from sklearn import datasets
from sklearn import model_selection
import pandas
import xgboost

# Seeds: one value for the sklearn-style name, one for the xgboost-style name.
RANDOM_STATE = 123
SEEDS = 123
random_state = RANDOM_STATE
seed = SEEDS

# ---- Parameters present on XGBRegressor ----
# Booster, objective and starting score.
booster = "gbtree"
objective = "reg:linear"
base_score = 0.5

# Number of boosting rounds and step size.
nestimator = 50
learning_rate = 0.1

# Tree growth settings.
max_depth = 5
min_child_weight = 2
max_delta_step = 0
gamma = 0

# Row / column sampling ratios (subsample is the only one below 1).
subsample = 0.9
colsample_bytree = 1
colsample_bylevel = 1

# Regularisation terms and class weighting.
reg_alpha = 0
reg_lambda = 1
scale_pos_weight = 1

# Missing-value marker, verbosity and threading.
missing = None
silent = True
n_jobs = 1
njobs = 1
nthread = -1

# ---- Parameters only known to native xgboost ----
disable_default_eval_metric = 1
colsample_bynode = 1
tree_method = "auto"

# ---- Early stopping ----
# Kept at None for simplicity; a non-None value would also require passing
# eval_set and eval_metric.
early_stopping_rounds = None
eval_set = None
eval_metric = "mae"

加载波士顿数据集

# Load the Boston dataset through sklearn.datasets.
dataset_boston = datasets.load_boston()

# Features as a DataFrame with named columns, target as a Series called
# "PRICE"; the same pair is also wrapped in a DMatrix for the native
# xgboost API.
X = pandas.DataFrame(dataset_boston.data, columns=dataset_boston.feature_names)
y = pandas.Series(data=dataset_boston.target, name="PRICE")
data_dmat = xgboost.DMatrix(X, label=y)

定义交叉验证

# A single 3-fold, shuffled, seeded splitter, meant to be handed to both
# xgboost.cv and sklearn.cross_val_score.
folds = model_selection.KFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

使用 sklearn.cross_val_score

# Collect the constructor keyword arguments for the scikit-learn wrapper
# (XGBRegressor) in one dictionary. Every value comes from the module-level
# constants defined earlier in the script.
param_sklearn = {
    "base_score": base_score,
    "booster": booster,
    "colsample_bylevel": colsample_bylevel,
    "colsample_bytree": colsample_bytree,
    "gamma": gamma,
    "learning_rate": learning_rate, # NOTE: the sklearn wrapper names this learning_rate, not "eta"
    "max_delta_step": max_delta_step,
    "max_depth": max_depth,
    "min_child_weight": min_child_weight,
    "missing": missing,
    "n_estimators": nestimator, # NOTE: the sklearn wrapper names the number of rounds n_estimators
    "n_jobs":n_jobs,
    "njobs":njobs, # NOTE(review): looks like a misspelling of n_jobs; confirm it is accepted
    "nthread":nthread,
    "objective": objective,
    "random_state": RANDOM_STATE,
    "reg_alpha": reg_alpha,
    "reg_lambda": reg_lambda,
    "scale_pos_weight": scale_pos_weight,
    "seed": SEEDS,
    "silent": silent,
    "subsample": subsample,
    # Keys the author lists as xgboost-only (not regular XGBRegressor arguments).
    "disable_default_eval_metric": disable_default_eval_metric,
    "eval_metric": eval_metric,
    "colsample_bynode": colsample_bynode,
    "tree_method": tree_method,
}
# Initialise the estimator by unpacking the dictionary as keyword arguments.
estimator = xgboost.sklearn.XGBRegressor(**param_sklearn)

# Keyword arguments handed to cross_val_score as fit_params, i.e. intended
# for the estimator's fit method. With early stopping disabled, eval_set and
# early_stopping_rounds are both None.
fit_params = {
    "sample_weight": None,
    "eval_set": eval_set,
    "eval_metric": eval_metric,
    "early_stopping_rounds": early_stopping_rounds,
    "verbose": False,
    "xgb_model": None,
    "sample_weight_eval_set": None,
}

# cross_val_score treats larger scores as better, so the mean absolute error
# is requested in its negated form.
scoring = "neg_mean_absolute_error"

# Cross-validate the sklearn estimator on X / y with the shared folds object.
results_cross_val_score = model_selection.cross_val_score(
    estimator, X=X, y=y, scoring=scoring, fit_params=fit_params, cv=folds
)

# Print the mean of the returned scores (negative because of the scoring
# choice above).
print(results_cross_val_score.mean())

使用 xgboost.cv

# Parameter dictionary for the native xgboost API (passed to xgboost.cv).
# It mirrors the sklearn dictionary key for key, except that the learning
# rate is given as "eta". Keys marked "author unsure" are the ones the
# original author suspected the native API may not use.
params = {
    "base_score": base_score,
    "booster": booster,
    "colsample_bylevel": colsample_bylevel,
    "colsample_bytree": colsample_bytree,
    "gamma": gamma,
    "eta": learning_rate, # NOTE: the native Python API names this "eta", not "learning_rate"
    "max_delta_step": max_delta_step,
    "max_depth": max_depth,
    "min_child_weight": min_child_weight,
    "missing": missing,
    "n_estimators": nestimator,  # author unsure; probably unused, the round count is passed to xgboost.cv separately
    "n_jobs":n_jobs, # author unsure whether the native API uses this key
    "njobs":njobs, # author unsure whether the native API uses this key
    "nthread":nthread, # author unsure whether the native API uses this key
    "objective": objective, # author unsure whether the native API uses this key
    "random_state": RANDOM_STATE, # author unsure whether the native API uses this key
    "reg_alpha": reg_alpha,
    "reg_lambda": reg_lambda,
    "scale_pos_weight": scale_pos_weight,
    "seed": SEEDS, # author unsure whether the native API uses this key
    "silent": silent,
    "subsample": subsample,
    # Keys the author lists as xgboost-only.
    "disable_default_eval_metric": disable_default_eval_metric,
    "eval_metric": eval_metric,
    "colsample_bynode": colsample_bynode,
    "tree_method": tree_method,
}

# Run the native xgboost cross-validation on the DMatrix with the same
# folds object and the same number of boosting rounds as the sklearn run.
cv_result = xgboost.cv(
    params=params,
    dtrain=data_dmat,
    num_boost_round=nestimator, # NOTE: the number of estimators is set here
    folds=folds,
    # nfold=3, # NOTE: left out; expected to be overridden by folds
    # stratified=False, # NOTE: left out; expected to be overridden by folds
    metrics=eval_metric,
    obj=None,
    feval=None,
    maximize=False,
    early_stopping_rounds=early_stopping_rounds, # NOTE: None, to keep debugging simple
    fpreproc=None,
    as_pandas=True,
    verbose_eval=False,
    show_stdv=False,
    seed=SEEDS,
    callbacks=None,
    # shuffle=True, # NOTE: left out; expected to be overridden by folds
)

# Take the last entry of the "test-<metric>-mean" column (presumably one row
# per boosting round, so this is the score after the final round) and print it.
results_xgboost_cv = cv_result["test-" + eval_metric + "-mean"].iloc[-1]
print(results_xgboost_cv)

运行上面的代码,我得到以下结果

-2.2714486795870408
2.2897483333333333

有人知道这两个结果为什么不同吗(除了符号相反之外)?

答案:我想我知道是怎么回事了。问题出在 subsample = 0.9 这个设置上:XGBoost 在构建树之前会随机抽取 90% 的训练数据,以防止过拟合,而我不知道如何让两种实现的随机采样使用相同的随机值进行初始化。设置 subsample = 1(即不做任何子采样)后,结果是:

2.3969616666666664
2.396961763339098

0 个答案:

没有答案