Question

我是StackOverflow社区的新手。我的目标是通过将Xgboost用于实际数据集的随机子样本（20％）来找到重要功能的列表。我的数据集有20万行和970列。我执行了4次代码。但是，我注意到提取的重要特征列表和迭代2到4的分数都相同。只有第一个迭代的重要功能与第2到第4个迭代有所不同。我很想知道为什么随机子样本会发生这种情况？还是我的代码中有任何错误？

下面是我的代码：

def getSample(frac):
    data =train.sample(frac=frac)
    return data

def modelfit(alg, X_all, y_all,i, useTrainCV=True, cv_folds=3, early_stopping_rounds=50):
    import xgboost as xgb

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_all, label=y_all)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=1)
        alg.set_params(n_estimators=cvresult.shape[0])

    #Fit the algorithm on the data
    alg.fit(X_all, y_all, eval_metric=['auc'])

    print(alg)

    #Predict training set:
    dtrain_predictions = alg.predict(X_all)
    dtrain_predprob = alg.predict_proba(X_all)[:,1]

    sorted_idx = np.argsort(alg.feature_importances_)[::-1]
    f_name = []
    score = []
    print('Important Features:')
    for index in sorted_idx:
        f_name.append(columns[index])
        score.append(alg.feature_importances_[index])
    im_Feature= pd.DataFrame(f_name, columns =["f_name"])
    im_Feature["score"] = score
    im_Feature=im_Feature.head(49)
    out_file = "D:/input output csv_10" + "/data_{}.csv".format(i+1)

    im_Feature.to_csv(out_file)
    print(im_Feature.head())
    #Print model report:
    print("AUC Score (Train): %f" % metrics.roc_auc_score(y_all, dtrain_predprob))

    out_file = "D:/input output csv_10" + "/data_{}.png".format(i+1)

    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)[:25]
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

    plt.savefig(out_file)
    print(feat_imp.to_string())

迭代

for num in np.arange(4):
    X_all = getSample(0.20)
    print(X_all.head())
    y_all = X_all['Response']
    X_all = X_all.iloc[:,:-1].values
    print(X_all)
    from xgboost.sklearn import XGBClassifier
    xgb1 = XGBClassifier(learning_rate=0.05,
                     base_score=0.0056,
                     n_estimators=50,
                     min_child_weight=1,
                     gamma=0.1,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=3)
    modelfit(xgb1, X_all, y_all, num)

这些数字在github

中可用

Xgboost：为数据的随机子样本生成同一组重要特征

0 个答案: