Question

我在此向 SO 上所有的 C++ 高手求助。

我已经在python中训练过（并成功测试过）xgboost模型，如下所示：

dtrain 
=xgb.DMatrix(np.asmatrix(X_train),label=np.asarray(y_train,dtype=np.int), feature_names=feat_names)

optimal_model = xgb.train(plst, dtrain)

dtest = xgb.DMatrix(np.asmatrix(X_test),feature_names=feat_names)

optimal_model.save_model('sigdet.model')

我参考了 XGBoost 上的一篇帖子（见链接），该帖子解释了在 C++ 中加载模型并进行预测的正确方法：

// Load Model
g_learner = std::make_unique<Learner>(Learner::Create({}));
        std::unique_ptr<dmlc::Stream> fi(
            dmlc::Stream::Create(filename, "r"));
        g_learner->Load(fi.get());

// Predict
    DMatrixHandle h_test;
        XGDMatrixCreateFromMat((float *)features, 1, numFeatures , -999.9f, &h_test);
        xgboost::bst_ulong out_len;


        std::vector<float> preds;
        g_learner->Predict((DMatrix*)h_test,true, &preds);

我的问题（1）：我需要创建一个DMatrix *，但我只有一个DMatrixHandle。如何使用我的数据正确创建DMatrix？

我的问题（2）：当我尝试以下预测方法时：

DMatrixHandle h_test;
XGDMatrixCreateFromMat((float *)features, 1, numFeatures , -999.9f, &h_test);
xgboost::bst_ulong out_len;


int res = XGBoosterPredict(g_modelHandle, h_test, 1, 0, &out_len, (const float**)&scores);

我得到的分数与在 Python 中加载完全相同的模型并进行预测所得到的分数完全不同。

无论是谁帮助我在c ++和python中获得一致的结果都可能会进入天堂。顺便说一下，我需要在c ++中为实时应用程序应用预测，否则我会使用不同的语言。

Answer 1

要获得DMatrix，您可以这样做：

g_learner->Predict(static_cast<std::shared_ptr<xgboost::DMatrix>*>(h_test)->get(), true, &pred);

对于问题（2），我没有答案。实际上我也遇到了同样的问题：我在 Python 中得到了一个 XGBRegression，而在 C++ 中使用相同的特征却得到了不同的结果。

Answer 2

因此，您用于序列化模型的方法：

    optimal_model.save_model('sigdet.model')

此方法会剥离模型的所有特征名称（请参见 https://github.com/dmlc/xgboost/issues/3089）。

将模型加载到 C++ 中进行预测时，列特征的顺序不一定会被保留。您可以通过调用 .dump_model() 方法进行验证。

另外，在您的 Python 和 C++ 模型对象上调用 .dump_model() 会得到相同的决策树，但 Python 端带有全部特征名称，而 C++ 端可能只有 f0、f1、f2……您可以对比两者以得到实际的列顺序，这样跨语言的预测结果就会一致（由于舍入误差，不会完全相同）。

我不清楚这些列是如何被排序的，但即使在滑动数据窗口上重新训练同一个模型，这似乎也是一个能保持顺序的稳定过程。对此我并没有 100% 的把握，也希望有人能够澄清。

许多在 Python 中训练、在其他语言中进行预测的 XGBoost 模型都存在此问题。我在 Java 中也遇到过这个问题，而且似乎没有办法在 XGBoost 的不同语言绑定之间保持特征列的顺序。

Answer 3

这是一个例子，但该程序的预测是相同的：

// End-to-end example using the XGBoost C API: build a synthetic training
// matrix, train a booster, predict on a test matrix, then free the handles.
// NOTE(review): the return codes of the XGBoost C API calls are ignored
// throughout this snippet; consider checking them.

// Synthetic training data: rows x cols matrix where cell (i, j) = (i+1)*(j+1).
const int cols=3,rows=100;
float train[rows][cols];
for (int i=0;i<rows;i++)
    for (int j=0;j<cols;j++)
        train[i][j] = (i+1) * (j+1);

// Binary labels: the first 50 rows are labelled 0, the remaining rows 1.
float train_labels[rows];
for (int i=0;i<50;i++)
    train_labels[i] = 0;
for (int i=50;i<rows;i++)
    train_labels[i] = 1;


// convert to DMatrix
// NOTE(review): the 4th argument (-1) is presumably the "missing value"
// marker of XGDMatrixCreateFromMat — confirm against the C API reference.
DMatrixHandle h_train[1];
XGDMatrixCreateFromMat((float *) train, rows, cols, -1, &h_train[0]);

// load the labels
XGDMatrixSetFloatInfo(h_train[0], "label", train_labels, rows);

// read back the labels, just a sanity check
bst_ulong bst_result;
const float *out_floats;
XGDMatrixGetFloatInfo(h_train[0], "label" , &bst_result, &out_floats);
for (unsigned int i=0;i<bst_result;i++)
    std::cout << "label[" << i << "]=" << out_floats[i] << std::endl;

// create the booster and load some parameters
BoosterHandle h_booster;
XGBoosterCreate(h_train, 1, &h_booster);
XGBoosterSetParam(h_booster, "objective", "binary:logistic");
XGBoosterSetParam(h_booster, "eval_metric", "error");
XGBoosterSetParam(h_booster, "silent", "0");
XGBoosterSetParam(h_booster, "max_depth", "9");
XGBoosterSetParam(h_booster, "eta", "0.1");
XGBoosterSetParam(h_booster, "min_child_weight", "3");
XGBoosterSetParam(h_booster, "gamma", "0.6");
XGBoosterSetParam(h_booster, "colsample_bytree", "1");
XGBoosterSetParam(h_booster, "subsample", "1");
XGBoosterSetParam(h_booster, "reg_alpha", "10");

// perform 10 learning iterations (the loop bound below is 10, not 200)
for (int iter=0; iter<10; iter++)
    XGBoosterUpdateOneIter(h_booster, iter, h_train[0]);

// predict
// The test matrix is filled with the same (i+1)*(j+1) pattern as the
// training matrix, so predictions are made on the training inputs.
const int sample_rows = 100;
float test[sample_rows][cols];
for (int i=0;i<sample_rows;i++)
    for (int j=0;j<cols;j++)
        test[i][j] = (i+1) * (j+1);
DMatrixHandle h_test;
XGDMatrixCreateFromMat((float *) test, sample_rows, cols, -1, &h_test);
bst_ulong out_len;
const float *f;
XGBoosterPredict(h_booster, h_test, 0,0,&out_len,&f);

// Print each of the out_len predictions returned through f.
for (unsigned int i=0;i<out_len;i++)
    std::cout << "prediction[" << i << "]=" << f[i] << std::endl;


// free xgboost internal structures
XGDMatrixFree(h_train[0]);
XGDMatrixFree(h_test);
XGBoosterFree(h_booster);

Answer 4

在问题（2）中，使用python训练模型并使用C ++进行预测。特征向量是一个float *数组。

DMatrixHandle h_test;
XGDMatrixCreateFromMat((float *)features, 1, numFeatures , -999.9f, &h_test);
xgboost::bst_ulong out_len;
int res = XGBoosterPredict(g_modelHandle, h_test, 1, 0, &out_len, (const 
float**)&scores);

因此，您需要使用密集矩阵格式（numpy数组）来训练模型。以下是官方文档中的python代码段。

data = np.random.rand(5, 10)  # 5 entities, each contains 10 features
label = np.random.randint(2, size=5)  # binary target
dtrain = xgb.DMatrix(data, label=label)

C++ 中的 xgboost 加载模型（python -> C++ 预测得分不匹配）

4 个答案: