Note: I have already looked at and tried what was suggested in previous questions, but it does not solve the problem.
I want to understand why xgboost.cv and sklearn.cross_val_score return (slightly) different results. To investigate, I set up a small experiment: configure the same parameters in both implementations (native Python xgboost and the sklearn wrapper) and, hopefully, obtain the same results... the problem is that I can't.
Import libraries and define constants
from sklearn import datasets
from sklearn import model_selection
import pandas
import xgboost
# define random state and seeds
RANDOM_STATE = 123
SEEDS = 123
# parameter definitions (the parameters exposed by XGBRegressor)
base_score = 0.5
booster = "gbtree"
colsample_bylevel = 1
colsample_bytree = 1
gamma = 0
learning_rate = 0.1
max_delta_step = 0
max_depth = 5
min_child_weight = 2
missing = None
nestimator = 50
n_jobs = 1
njobs = 1
nthread = -1
objective = "reg:linear"
random_state = RANDOM_STATE
reg_alpha = 0
reg_lambda = 1
scale_pos_weight = 1
seed = SEEDS
silent = True
subsample = 0.9
# parameters xgboost
disable_default_eval_metric = 1
colsample_bynode = 1
tree_method = "auto"
# for early stopping (we set early_stopping_rounds = None for simplicity; if not None we would also have to pass eval_set and eval_metric)
early_stopping_rounds = None
eval_set = None
eval_metric = "mae"
Load the Boston dataset
# load boston dataset
dataset_boston = datasets.load_boston()
# extract X, y and transform in DMatrix format
X = pandas.DataFrame(data=dataset_boston.data,
                     columns=dataset_boston.feature_names)
y = pandas.Series(dataset_boston.target, name="PRICE")
data_dmat = xgboost.DMatrix(data=X, label=y)
Define the cross-validation
# define 3 folds to be used both in xgboost.cv and sklearn.cross_val_score
folds = model_selection.KFold(n_splits=3,
                              shuffle=True,
                              random_state=RANDOM_STATE)
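As a quick sanity check (not part of the original results), the snippet below prints the splits produced by the shared KFold object; since shuffle=True is combined with a fixed random_state, both xgboost.cv and cross_val_score should consume exactly the same train/test indices.
# sanity check: the shared KFold object yields deterministic splits,
# so both APIs should see identical train/test indices
for i, (train_idx, test_idx) in enumerate(folds.split(X)):
    print("fold", i,
          "train size:", len(train_idx),
          "test size:", len(test_idx),
          "first test indices:", test_idx[:5])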
Working with SKLEARN.CROSS_VAL_SCORE
# put parameters in dictionary
param_sklearn = {
"base_score": base_score,
"booster": booster,
"colsample_bylevel": colsample_bylevel,
"colsample_bytree": colsample_bytree,
"gamma": gamma,
"learning_rate": learning_rate, # NOTE: in sklearn is called learning_rate not "eta"
"max_delta_step": max_delta_step,
"max_depth": max_depth,
"min_child_weight": min_child_weight,
"missing": missing,
"n_estimators": nestimator, # NOTE: in sklearn are called n_estimators
"n_jobs":n_jobs,
"njobs":njobs,
"nthread":nthread,
"objective": objective,
"random_state": RANDOM_STATE,
"reg_alpha": reg_alpha,
"reg_lambda": reg_lambda,
"scale_pos_weight": scale_pos_weight,
"seed": SEEDS,
"silent": silent,
"subsample": subsample,
# only xgboost
"disable_default_eval_metric": disable_default_eval_metric,
"eval_metric": eval_metric,
"colsample_bynode": colsample_bynode,
"tree_method": tree_method,
}
# initialize the estimator
estimator = xgboost.sklearn.XGBRegressor(**param_sklearn)
# parameters to pass to the fit method
fit_params = {
"sample_weight": None,
"eval_set": eval_set,
"eval_metric": eval_metric,
"early_stopping_rounds": early_stopping_rounds,
"verbose": False,
"xgb_model": None,
"sample_weight_eval_set": None,
}
# we need to use a negative score because cross_val_score maximizes the score
scoring = "neg_mean_absolute_error"
results_cross_val_score = model_selection.cross_val_score(
estimator, X=X, y=y, scoring=scoring, fit_params=fit_params, cv=folds
)
print(results_cross_val_score.mean())
Working with XGBOOST.CV
# xgboost parameters
params = {
"base_score": base_score,
"booster": booster,
"colsample_bylevel": colsample_bylevel,
"colsample_bytree": colsample_bytree,
"gamma": gamma,
"eta": learning_rate, # NOTE: in python implementation is called eta not "learning_rate"
"max_delta_step": max_delta_step,
"max_depth": max_depth,
"min_child_weight": min_child_weight,
"missing": missing,
"n_estimators": nestimator, # NOT? # NOTE: probably not used since we pass nrounds after
"n_jobs":n_jobs, # NOT?
"njobs":njobs, # NOT?
"nthread":nthread, # NOT?
"objective": objective, # NOT?
"random_state": RANDOM_STATE, # NOT?
"reg_alpha": reg_alpha,
"reg_lambda": reg_lambda,
"scale_pos_weight": scale_pos_weight,
"seed": SEEDS, # NOT?
"silent": silent,
"subsample": subsample,
# only Xgboost
"disable_default_eval_metric": disable_default_eval_metric,
"eval_metric": eval_metric,
"colsample_bynode": colsample_bynode,
"tree_method": tree_method,
}
# xgboost cv
cv_result = xgboost.cv(
params=params,
dtrain=data_dmat,
num_boost_round=nestimator, # NOTE: the number of estimators is set here
folds=folds,
# nfold=3, # NOTE: should be overridden by folds
# stratified=False, # NOTE: should be overridden by folds
metrics=eval_metric,
obj=None,
feval=None,
maximize=False,
early_stopping_rounds=early_stopping_rounds, # NOTE: set to None to simplify debug
fpreproc=None,
as_pandas=True,
verbose_eval=False,
show_stdv=False,
seed=SEEDS,
callbacks=None,
# shuffle=True, # NOTE: should be overridden by folds
)
results_xgboost_cv = cv_result["test-" + eval_metric + "-mean"].iloc[-1]
print(results_xgboost_cv)
Running the code above, I get the following results:
-2.2714486795870408
2.2897483333333333
Any idea why they are different (apart from their sign)?
Answer

I think I know what is happening... The problem is the setting subsample = 0.9. XGBoost randomly samples 90% of the training data before growing the trees, to prevent overfitting, and I don't know how to initialize that random subsampling with the same random values in the two implementations. Setting subsample = 1 (i.e. no subsampling at all), the results are:
2.3969616666666664
2.396961763339098
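For completeness, here is a minimal sketch of that final comparison. It simply reuses the objects defined above, overrides subsample in both parameter dictionaries, and compares the absolute value of the (negative) sklearn score with the xgboost.cv result; the exact digits may vary slightly with the xgboost version.
# re-run both cross-validations with subsampling disabled
param_sklearn["subsample"] = 1
params["subsample"] = 1

estimator_nosub = xgboost.sklearn.XGBRegressor(**param_sklearn)
score_sklearn = model_selection.cross_val_score(
    estimator_nosub, X=X, y=y, scoring=scoring, fit_params=fit_params, cv=folds
).mean()

cv_result_nosub = xgboost.cv(
    params=params,
    dtrain=data_dmat,
    num_boost_round=nestimator,
    folds=folds,
    metrics=eval_metric,
    as_pandas=True,
    verbose_eval=False,
    seed=SEEDS,
)
score_xgboost = cv_result_nosub["test-" + eval_metric + "-mean"].iloc[-1]

# cross_val_score returns the negated MAE, so compare absolute values
print(abs(score_sklearn), score_xgboost)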