我是 StackOverflow 社区的新手。我的目标是对实际数据集的随机子样本(20%)运行 XGBoost,从而得到重要特征的列表。我的数据集有 20 万行、970 列。我把代码运行了 4 次迭代,但发现第 2 到第 4 次迭代提取出的重要特征列表及其得分完全相同,只有第 1 次迭代得到的重要特征与第 2 到第 4 次不同。我想知道,既然每次使用的都是随机子样本,为什么会出现这种情况?还是说我的代码里有错误?
下面是我的代码:
def getSample(frac, random_state=None):
    """Return a random subsample of the module-level ``train`` data.

    Args:
        frac: Fraction of rows to draw; forwarded unchanged to
            ``train.sample``.
        random_state: Optional seed forwarded to ``train.sample``. The
            default ``None`` keeps the original unseeded behaviour; pass an
            int to make a particular draw reproducible.

    Returns:
        Whatever ``train.sample`` returns (presumably a pandas DataFrame
        holding the sampled rows -- ``train`` is defined outside this
        function).
    """
    return train.sample(frac=frac, random_state=random_state)
def modelfit(alg, X_all, y_all,i, useTrainCV=True, cv_folds=3, early_stopping_rounds=50):
    """Fit ``alg`` on the given data and export its feature importances.

    Optionally tunes ``n_estimators`` with ``xgb.cv``, fits the estimator,
    writes the 49 highest-scoring features to ``data_<i+1>.csv``, prints the
    training AUC, and saves a bar chart of the 25 highest booster f-scores
    to ``data_<i+1>.png``.

    Args:
        alg: Estimator exposing ``get_xgb_params``, ``get_params``,
            ``set_params``, ``fit``, ``predict_proba``,
            ``feature_importances_`` and ``get_booster`` (the caller passes
            an ``XGBClassifier``).
        X_all: Feature matrix handed to ``xgb.DMatrix`` and ``alg.fit``.
        y_all: Labels aligned with ``X_all``.
        i: Zero-based run index; ``i + 1`` is used in the output file names.
        useTrainCV: When true, run ``xgb.cv`` first and set ``n_estimators``
            to the number of rows in the CV result.
        cv_folds: Number of folds passed to ``xgb.cv``.
        early_stopping_rounds: Early-stopping patience passed to ``xgb.cv``.

    Returns:
        None. Results are printed and written to disk.

    Relies on the module-level names ``np``, ``pd``, ``plt``, ``metrics``
    and ``columns`` (presumably the feature names in the same order as the
    columns of ``X_all`` -- verify against the caller).
    """
    import xgboost as xgb

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_all, label=y_all)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=1)
        # One row per boosting round kept by the CV run.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(X_all, y_all, eval_metric=['auc'])
    print(alg)

    # Probability of the positive class on the training set.
    dtrain_predprob = alg.predict_proba(X_all)[:, 1]

    # Read the importances once instead of on every loop iteration.
    importances = alg.feature_importances_
    sorted_idx = np.argsort(importances)[::-1]

    print('Important Features:')
    im_Feature = pd.DataFrame({
        "f_name": [columns[index] for index in sorted_idx],
        "score": [importances[index] for index in sorted_idx],
    }).head(49)

    out_dir = "D:/input output csv_10"
    out_file = out_dir + "/data_{}.csv".format(i+1)
    im_Feature.to_csv(out_file)
    print(im_Feature.head())

    # Print model report:
    print("AUC Score (Train): %f" % metrics.roc_auc_score(y_all, dtrain_predprob))

    out_file = out_dir + "/data_{}.png".format(i+1)
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)[:25]
    # Draw on a dedicated figure and close it afterwards: plotting onto
    # pyplot's current axes would let bars from earlier calls pile up in
    # later PNGs and would keep every figure alive.
    fig, ax = plt.subplots()
    try:
        feat_imp.plot(kind='bar', title='Feature Importances', ax=ax)
        ax.set_ylabel('Feature Importance Score')
        fig.savefig(out_file)
    finally:
        plt.close(fig)
    print(feat_imp.to_string())
迭代
from xgboost.sklearn import XGBClassifier

# Hyper-parameters shared by every run; a fresh classifier is built from
# them on each pass so no fitted state carries over between runs.
xgb_settings = dict(
    learning_rate=0.05,
    base_score=0.0056,
    n_estimators=50,
    min_child_weight=1,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=3,
)

# Four independent runs, each on a new 20% random subsample.
for num in np.arange(4):
    sample = getSample(0.20)
    print(sample.head())

    y_all = sample['Response']
    # Every column except the last one is used as a feature.
    # NOTE(review): assumes 'Response' is the last column -- confirm.
    X_all = sample.iloc[:, :-1].values
    print(X_all)

    xgb1 = XGBClassifier(**xgb_settings)
    modelfit(xgb1, X_all, y_all, num)
这些图可以在 GitHub 上查看。