Combining xgboost with multiprocessing.Pool

Posted: 2019-02-14 08:51:53

Tags: nested multiprocessing cross-validation xgboost pool

Dear data-processing experts, I'm running into a problem combining multiprocessing.Pool with my own nested cross-validation function.

In this cross-validation function I use the inner CV for a grid search to find the best hyperparameters, and the outer CV to evaluate the hyperparameters that were selected. Below you'll find a reproducible example; if you copy it, it should run as-is. I'm fairly sure the cause of the problem is an interaction between the xgboost package and the multiprocessing package. What I find particularly strange is that the first outer cross-validation runs exactly as I intend, but at the start of the second outer cross-validation it breaks right before the first "_inner_model.fit" call.

I'm running this code on:
Distribution: Linux Mint 19 Tara, kernel 4.15.0-38-generic
PyCharm 2019.2.4 (Community Edition), build #PC-182_152-release-1248-b8 amd64
Python 3.6 with Anaconda 5.3.0
xgboost 0.81

I've stumbled upon these similar issues / possible solutions:

https://github.com/dmlc/xgboost/issues/2163
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58378
https://gcc.gnu.org/bugzilla/attachment.cgi?id=30784&action=diff
https://github.com/joblib/joblib/issues/138
https://bugs.python.org/issue18999
https://hg.python.org/cpython/rev/72a5ac909c7a
https://github.com/scikit-learn/scikit-learn/issues/2889
https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
https://bisqwit.iki.fi/story/howto/openmp/#OpenmpAndFork
https://github.com/scikit-learn/scikit-learn/issues/6627


but I still haven't found a solution. Please don't be put off by the amount of code: I believe the most likely cause is simply the package interaction mentioned above, not the way the code is structured. To make comparison easier, at the bottom of the post I've also included a nearly identical script that uses scikit-learn's RandomForestClassifier instead; it runs without error.
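Based on the linked multiprocessing documentation and the fork/OpenMP discussions, one workaround I'm considering is starting the pool workers with the 'spawn' start method instead of the default fork, so that workers do not inherit the parent's OpenMP/CUDA state. A minimal, self-contained sketch of that change (an assumption on my part; I have not confirmed that it fixes this particular crash):

import multiprocessing

def worker(task):
    # stand-in for multiprocess_inner_crossvalidation from the script below
    return task

if __name__ == '__main__':
    # 'spawn' starts fresh interpreter processes instead of fork()ing the parent
    ctx = multiprocessing.get_context('spawn')
    with ctx.Pool(processes=multiprocessing.cpu_count() - 1) as pool:
        results = pool.map(worker, range(4))
    print(results)

Note that 'spawn' requires the worker function and its arguments to be picklable and the entry point to be guarded by if __name__ == '__main__'.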

Code:

import xgboost
import itertools
import multiprocessing
from sklearn import datasets
from sklearn.model_selection import StratifiedKFold
# Function for the inner cross-validation (for grid-search purposes); each hyperparameter combination is submitted to the pool via Pool.apply_async
def multiprocess_inner_crossvalidation(
        _collection_of_hyperparameters_and_data,
        _X_train,
        _y_train,
        _skfInner,
        _scores):
    _inner_model = xgboost.XGBClassifier(**(_collection_of_hyperparameters_and_data[0]))
    # inner cv fold splits
    for train_index_inner, test_index_inner in _skfInner.split(_X_train, _y_train):
        X_train_inner, X_test_inner = (_collection_of_hyperparameters_and_data[1])[train_index_inner], \
                                      (_collection_of_hyperparameters_and_data[1])[test_index_inner]
        y_train_inner, y_test_inner = (_collection_of_hyperparameters_and_data[2])[train_index_inner], \
                                      (_collection_of_hyperparameters_and_data[2])[test_index_inner]
        _inner_model.fit(
            X_train_inner,
            y_train_inner,
            verbose=False,
            early_stopping_rounds=20,
            eval_metric="auc",
            eval_set=[(X_test_inner, y_test_inner)])
        _scores.append(_inner_model.best_score)
    avgScore = float(sum(_scores)) / len(_scores)
    current__inner_model_params = _inner_model.get_params()
    return [current__inner_model_params, avgScore]
# just an indication to notify when new pool workers are created
def start_process():
    print('Starting', multiprocessing.current_process().name)
# defining the main function
def main_function(XGB_classifier_and_gridsearch_parameters):
    skf = StratifiedKFold(
        n_splits=5,
        shuffle=True)
    skfInner = StratifiedKFold(
        n_splits=5,
        shuffle=True)
    def my_product(dicts):
        return (dict(
            zip(dicts, x)) for x in itertools.product(*dicts.values()))
    bestModelScores = []
    bestModelsOuterparams = []
    outerCounter = 1
    # creating mock data
    X, y = datasets.make_classification(n_samples=500, n_features=30,
                                        n_informative=15, n_redundant=15,
                                        random_state=42)
    # outer cv splits
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print("\n", 80 * ("-"), " STARTED OUTER CROSSVALIDATION ", outerCounter, 80         * ("-"))
        scores = []
        hyperparameter_combinations = []
        collection_of_hyperparameters_and_data = []
        for params in my_product(XGB_classifier_and_gridsearch_parameters):
            hyperparameter_combinations.append(params)
        collection_of_hyperparameters_and_data = [[x, X_train, y_train] for x in hyperparameter_combinations]
        inputs = collection_of_hyperparameters_and_data
        # print all tasks which will be forwarded to workers
        print("Pool tasks:")
        for i in inputs:
            print(i[0])
        # initializing pool of workers and giving them tasks
        pool_size = multiprocessing.cpu_count() - 1
        pool = multiprocessing.Pool(processes=pool_size,
                                    initializer=start_process,
                                    )
        pool_outputs = [pool.apply_async(multiprocess_inner_crossvalidation,
                                         args=(x,
                                               X_train,
                                               y_train,
                                               skfInner,
                                               scores)) for x in inputs]
        pool.close()
        pool.join()
        # appending results and choosing parameters which gave best score
        pool_results = [p.get() for p in pool_outputs]
        tempModels = pool_results
        tempModels.sort(key=lambda x: x[1])
        bestMod = tempModels[-1][0]
        outerModel = xgboost.XGBClassifier(**bestMod)
        outerModel.fit(
            X_train,
            y_train,
            verbose=False,
            early_stopping_rounds=20,
            eval_metric="auc",
            eval_set=[(X_test, y_test)])
        # appending outer fold results
        bestModelScores.append(outerModel.best_score)
        outer_model_params = outerModel.get_params()
        bestModelsOuterparams.append([outer_model_params, outerModel.best_score])
        print("\n", 30 * ("-"), "FINISHING OUTER CROSS VALIDATION", outerCounter, 30     * ("-"))
        print('\nOuter crossvalidation score of the best inner model is:',     outerModel.best_score)
        outerCounter += 1
    # printing average results of the outer folds
    avgBestModelScores = float(sum(bestModelScores)) / len(bestModelScores)
    bestModelsOuterparams.sort(key=lambda x: x[1])
    print("Average score of outer crossvalidation models is", avgBestModelScores)
main_function(XGB_classifier_and_gridsearch_parameters={'objective': ['binary:logistic'], 'nthread': [1], "max_depth": [5, 9], "subsample": [0.7, 1]})
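For clarity, the my_product helper above simply expands the parameter-grid dict into one dict per combination; a minimal standalone illustration with a reduced grid:

import itertools

grid = {"max_depth": [5, 9], "subsample": [0.7, 1]}
combos = [dict(zip(grid, values)) for values in itertools.product(*grid.values())]
print(combos)
# [{'max_depth': 5, 'subsample': 0.7}, {'max_depth': 5, 'subsample': 1},
#  {'max_depth': 9, 'subsample': 0.7}, {'max_depth': 9, 'subsample': 1}]

These combinations are exactly the "Pool tasks" printed in the output below.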

Output and error message:

-------------------------------------------------------------------------------  STARTED OUTER CROSSVALIDATION  1 --------------------------------------------------------------------------------
Pool tasks:
{'objective': 'binary:logistic', 'nthread': 1, 'max_depth': 5, 'subsample': 0.7}
{'objective': 'binary:logistic', 'nthread': 1, 'max_depth': 5, 'subsample': 1}
{'objective': 'binary:logistic', 'nthread': 1, 'max_depth': 9, 'subsample': 0.7}
{'objective': 'binary:logistic', 'nthread': 1, 'max_depth': 9, 'subsample': 1}
Starting ForkPoolWorker-1
Starting ForkPoolWorker-2
Starting ForkPoolWorker-3

 ------------------------------ FINISHING OUTER CROSS VALIDATION 1 ------------------------------

Outer crossvalidation score of the best inner model is: 0.976078

 --------------------------------------------------------------------------------  STARTED OUTER CROSSVALIDATION  2 --------------------------------------------------------------------------------
Pool tasks:
{'objective': 'binary:logistic', 'nthread': 1, 'max_depth': 5, 'subsample': 0.7}
{'objective': 'binary:logistic', 'nthread': 1, 'max_depth': 5, 'subsample': 1}
{'objective': 'binary:logistic', 'nthread': 1, 'max_depth': 9, 'subsample': 0.7}
{'objective': 'binary:logistic', 'nthread': 1, 'max_depth': 9, 'subsample': 1}
Starting ForkPoolWorker-4
Starting ForkPoolWorker-5
Starting ForkPoolWorker-6
terminate called after throwing an instance of 'dmlc::Error'
  what():  [17:34:11] /workspace/include/xgboost/../../src/common/common.h:41: /workspace/src/common/host_device_vector.cu: 151: initialization error

Stack trace returned 10 entries:
[bt] (0) /home/user/envs/application/xgboost/libxgboost.so(dmlc::StackTrace()+0x3d) [0x7f283eeca5cd]
[bt] (1) /home/user/envs/application/xgboost/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x18) [0x7f283eeca9c8]
[bt] (2) /home/user/envs/application/xgboost/libxgboost.so(dh::ThrowOnCudaError(cudaError, char const*, int)+0x178) [0x7f283f09cc18]
[bt] (3) /home/user/envs/application/xgboost/libxgboost.so(xgboost::HostDeviceVectorImpl<int>::DeviceShard::Init(xgboost::HostDeviceVectorImpl<int>*, int)+0x278) [0x7f283f0fa498]
[bt] (4) /home/user/envs/application/xgboost/libxgboost.so(+0x2fdfeb) [0x7f283f0ccfeb]
[bt] (5) /home/user/envs/application/xgboost/libxgboost.so(xgboost::HostDeviceVectorImpl<int>::Reshard(xgboost::GPUDistribution const&)+0x1b1) [0x7f283f0fb631]
[bt] (6) /home/user/envs/application/xgboost/libxgboost.so(xgboost::obj::RegLossObj<xgboost::obj::LogisticClassification>::GetGradient(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*)+0x4f7) [0x7f283f0c06b7]
[bt] (7) /home/user/envs/application/xgboost/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x362) [0x7f283ef411e2]
[bt] (8) /home/user/envs/application/xgboost/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7f283eec2ab5]
[bt] (9) /home/user/envs/application/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f28619dfec0]


terminate called after throwing an instance of 'dmlc::Error'
  what():  [17:34:11] /workspace/include/xgboost/../../src/common/common.h:41: /workspace/src/common/host_device_vector.cu: 151: initialization error

Stack trace returned 10 entries:
[bt] (0) /home/user/envs/application/xgboost/libxgboost.so(dmlc::StackTrace()+0x3d) [0x7f283eeca5cd]
[bt] (1) /home/user/envs/application/xgboost/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x18) [0x7f283eeca9c8]
[bt] (2) /home/user/envs/application/xgboost/libxgboost.so(dh::ThrowOnCudaError(cudaError, char const*, int)+0x178) [0x7f283f09cc18]
[bt] (3) /home/user/envs/application/xgboost/libxgboost.so(xgboost::HostDeviceVectorImpl<int>::DeviceShard::Init(xgboost::HostDeviceVectorImpl<int>*, int)+0x278) [0x7f283f0fa498]
[bt] (4) /home/user/envs/application/xgboost/libxgboost.so(+0x2fdfeb) [0x7f283f0ccfeb]
[bt] (5) /home/user/envs/application/xgboost/libxgboost.so(xgboost::HostDeviceVectorImpl<int>::Reshard(xgboost::GPUDistribution const&)+0x1b1) [0x7f283f0fb631]
[bt] (6) /home/user/envs/application/xgboost/libxgboost.so(xgboost::obj::RegLossObj<xgboost::obj::LogisticClassification>::GetGradient(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*)+0x4f7) [0x7f283f0c06b7]
[bt] (7) /home/user/envs/application/xgboost/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x362) [0x7f283ef411e2]
[bt] (8) /home/user/envs/application/xgboost/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7f283eec2ab5]
[bt] (9) /home/user/envs/application/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f28619dfec0]


terminate called after throwing an instance of 'dmlc::Error'
  what():  [17:34:11] /workspace/include/xgboost/../../src/common/common.h:41: /workspace/src/common/host_device_vector.cu: 151: initialization error

Stack trace returned 10 entries:
[bt] (0) /home/user/envs/application/xgboost/libxgboost.so(dmlc::StackTrace()+0x3d) [0x7f283eeca5cd]
[bt] (1) /home/user/envs/application/xgboost/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x18) [0x7f283eeca9c8]
[bt] (2) /home/user/envs/application/xgboost/libxgboost.so(dh::ThrowOnCudaError(cudaError, char const*, int)+0x178) [0x7f283f09cc18]
[bt] (3) /home/user/envs/application/xgboost/libxgboost.so(xgboost::HostDeviceVectorImpl<int>::DeviceShard::Init(xgboost::HostDeviceVectorImpl<int>*, int)+0x278) [0x7f283f0fa498]
[bt] (4) /home/user/envs/application/xgboost/libxgboost.so(+0x2fdfeb) [0x7f283f0ccfeb]
[bt] (5) /home/user/envs/application/xgboost/libxgboost.so(xgboost::HostDeviceVectorImpl<int>::Reshard(xgboost::GPUDistribution const&)+0x1b1) [0x7f283f0fb631]
[bt] (6) /home/user/envs/application/xgboost/libxgboost.so(xgboost::obj::RegLossObj<xgboost::obj::LogisticClassification>::GetGradient(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*)+0x4f7) [0x7f283f0c06b7]
[bt] (7) /home/user/envs/application/xgboost/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x362) [0x7f283ef411e2]
[bt] (8) /home/user/envs/application/xgboost/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7f283eec2ab5]
[bt] (9) /home/user/envs/application/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f28619dfec0]


Starting ForkPoolWorker-7
Starting ForkPoolWorker-8
terminate called after throwing an instance of 'dmlc::Error'
  what():  [17:34:11] /workspace/include/xgboost/../../src/common/common.h:41: /workspace/src/common/host_device_vector.cu: 151: initialization error

Stack trace returned 10 entries:
[bt] (0) /home/user/envs/application/xgboost/libxgboost.so(dmlc::StackTrace()+0x3d) [0x7f283eeca5cd]
[bt] (1) /home/user/envs/application/xgboost/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x18) [0x7f283eeca9c8]
[bt] (2) /home/user/envs/application/xgboost/libxgboost.so(dh::ThrowOnCudaError(cudaError, char const*, int)+0x178) [0x7f283f09cc18]
[bt] (3) /home/user/envs/application/xgboost/libxgboost.so(xgboost::HostDeviceVectorImpl<int>::DeviceShard::Init(xgboost::HostDeviceVectorImpl<int>*, int)+0x278) [0x7f283f0fa498]
[bt] (4) /home/user/envs/application/xgboost/libxgboost.so(+0x2fdfeb) [0x7f283f0ccfeb]
[bt] (5) /home/user/envs/application/xgboost/libxgboost.so(xgboost::HostDeviceVectorImpl<int>::Reshard(xgboost::GPUDistribution const&)+0x1b1) [0x7f283f0fb631]
[bt] (6) /home/user/envs/application/xgboost/libxgboost.so(xgboost::obj::RegLossObj<xgboost::obj::LogisticClassification>::GetGradient(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*)+0x4f7) [0x7f283f0c06b7]
[bt] (7) /home/user/envs/application/xgboost/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x362) [0x7f283ef411e2]
[bt] (8) /home/user/envs/application/xgboost/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7f283eec2ab5]
[bt] (9) /home/user/envs/application/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f28619dfec0]


Starting ForkPoolWorker-9
Starting ForkPoolWorker-10

Finally, the random forest version, which does not raise the error. (For what it's worth, the stack traces above all point into host_device_vector.cu, i.e. xgboost's CUDA code path, which I assume ties this to the fork-related issues linked earlier.)

import time
import itertools
import multiprocessing
from sklearn import datasets
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
def multiprocess_inner_crossvalidation(
        _collection_of_hyperparameters_and_data,
        _X_train,
        _y_train,
        _skfInner,
        _scores):
    params = _collection_of_hyperparameters_and_data[0]
    print(params)
    _inner_model = RandomForestClassifier(**params)
    for train_index_inner, test_index_inner in _skfInner.split(_X_train, _y_train):
        X_train_inner, X_test_inner = (_collection_of_hyperparameters_and_data[1])[train_index_inner], \
                                      (_collection_of_hyperparameters_and_data[1])[test_index_inner]
        y_train_inner, y_test_inner = (_collection_of_hyperparameters_and_data[2])[train_index_inner], \
                                      (_collection_of_hyperparameters_and_data[2])[test_index_inner]
        _inner_model.fit(
            X_train_inner,
            y_train_inner,
            # verbose=False,
            # early_stopping_rounds=20,
            # eval_metric="auc",
            # eval_set=[(X_test_inner, y_test_inner)]
        )
        # RandomForestClassifier has no best_score attribute, and .oob_score is just
        # the constructor flag, so score the held-out inner fold instead
        _scores.append(_inner_model.score(X_test_inner, y_test_inner))
    avgScore = float(sum(_scores)) / len(_scores)
    current__inner_model_params = _inner_model.get_params()
    # tempModels.append([model.get_params(), avgScore])
    print("MOMENT BEFORE EXITING INNER LOOP")
    return [current__inner_model_params, avgScore]
def start_process():
    print('Starting', multiprocessing.current_process().name)
def main_function(XGB_classifier_and_gridsearch_parameters):
    skf = StratifiedKFold(
        n_splits=5,
        shuffle=True)
    skfInner = StratifiedKFold(
        n_splits=5,
        shuffle=True)
    def my_product(dicts):
        return (dict(
            zip(dicts, x)) for x in itertools.product(*dicts.values()))
    bestModelScores = []
    bestModelsOuterparams = []
    outerCounter = 1
    X, y = datasets.make_classification(n_samples=500, n_features=30,
                                        n_informative=15, n_redundant=15,
                                        random_state=42)
    start_time = time.time()
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print("\n", 80 * ("-"), " STARTED OUTER CROSSVALIDATION ", outerCounter, 80 * ("-"))
        scores = []
        hyperparameter_combinations = []
        collection_of_hyperparameters_and_data = []
        for params in my_product(XGB_classifier_and_gridsearch_parameters):
            hyperparameter_combinations.append(params)
        collection_of_hyperparameters_and_data = [[x, X_train, y_train] for x in hyperparameter_combinations]
        inputs = collection_of_hyperparameters_and_data
        for i in inputs:
            print(i[0])
        pool_size = multiprocessing.cpu_count()
        pool = multiprocessing.Pool(processes=pool_size,
                                    initializer=start_process,
                                    )
        pool_outputs = [pool.apply_async(multiprocess_inner_crossvalidation,
                                         args=(x,
                                               X_train,
                                               y_train,
                                               skfInner,
                                               scores)) for x in inputs]
        pool.close()  # no more tasks
        pool.join()  # wrap up current tasks
        pool_results = [p.get() for p in pool_outputs]
        print("Pool results", pool_results)
        tempModels = pool_results
        tempModels.sort(key=lambda x: x[1])
        bestMod = tempModels[-1][0]
        outerModel = RandomForestClassifier(**bestMod)
        outerModel.fit(
            X_train,
            y_train,
            # verbose=False,
            # early_stopping_rounds=20,
            # eval_metric="auc",
            # eval_set=[(X_test, y_test)]
        )
        bestModelScores.append(outerModel.feature_importances_)
        outer_model_params = outerModel.get_params()
        bestModelsOuterparams.append([outer_model_params, outerModel.feature_importances_])
        print("\n\n\n ", 30 * ("-"), "FINISHING OUTER CROSS VALIDATION", outerCounter, 30 * ("-"))
        print('\nOuter crossvalidation feature importances:', outerModel.feature_importances_)
        print("\n Current outer model params are:\n", outer_model_params, "\n\n\n\n")
        outerCounter += 1
    print("Standard procedure run time was ", start_time - time.time())
    # print("best outer model score was:", best_outer_mod_score)
    return
main_function(XGB_classifier_and_gridsearch_parameters={"n_estimators": [10, 20], "max_depth": [2, 3]})

0 Answers:

There are no answers yet.