Multiprocessing training of multiple models on multiple datasets

Time: 2018-01-29 10:32:33

Tags: python-3.x scikit-learn python-multiprocessing

I have multiple datasets, and I want to train multiple models on each of them. As an example, on data_1 I want to train a random forest and a gradient boosting model, then do the same for data_2, and so on.

Is training the models in parallel with multiprocessing the best and most efficient approach, or should I just loop over them one at a time? Or should multiprocessing be avoided with machine learning models altogether, since scikit-learn already ships some built-in parallelism (the n_jobs parameter)?
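For comparison, the built-in route would look something like the sketch below (a minimal, hypothetical configuration; n_jobs=-1 tells the search to fan its cross-validation fits out over all available cores):

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# scikit-learn's built-in parallelism: the search itself runs the
# candidate/fold fits in parallel on all cores.
search = RandomizedSearchCV(
    RandomForestClassifier(),
    param_distributions={"n_estimators": [500], "max_depth": [5, 7]},
    n_iter=2,
    scoring="f1",
    n_jobs=-1,
)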

Below is the kind of example I am interested in, using a Pool:

import numpy as np
import pandas as pd

from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

from multiprocessing import Pool, cpu_count

import inspect

from sklearn.datasets import load_iris


def apply_parallel_training(*fns, dataset):
    """
    Train every model function on every dataset in a worker pool and
    return a flat list of result DataFrames.
    """

    pool = Pool(cpu_count() - 1)

    # Submit all jobs before collecting any results; calling .get()
    # inside the submission loop would block and serialize the model
    # functions instead of running them concurrently.
    async_results = [pool.map_async(fn, dataset) for fn in fns]

    pool.close()
    pool.join()

    # Each map_async call yields one DataFrame per dataset; flatten
    # the per-function lists into a single list.
    return [df for result in async_results for df in result.get()]

def random_forest(x):
    """
    Specify RF model.
    """

    df = train_some_models(x = x, 
        clf = RandomForestClassifier(), 
        param_dist = {"n_estimators":  [500],
          "max_depth":  [5, 7],
          "max_features": ["auto", "log2"],
          "bootstrap":  [True, False],
          "criterion":  ["gini", "entropy"]
          }, 
        n_iter = 5, 
        model_name = inspect.stack()[0][3])

    return df

def gradient_boosting_tree(x):
    """
    Specify GBM model.
    """

    df = train_some_models(x = x, 
        clf = XGBClassifier(), 
        param_dist = {"n_estimators": [200],
          "learning_rate": [0.005, 0.01, 0.05, 0.1],
          "booster": ["gbtree"],
          "max_depth": [5, 7]
          }, 
        n_iter = 5, 
        model_name = inspect.stack()[0][3])

    return df


def train_some_models(x, clf, param_dist, n_iter, model_name):
    """
    Train models on a specified dataset and predict for known labels.

    Args:
        x: Dataset in the form of a dataframe.
        clf: Classifier.
        param_dist: Specified model parameters.
        n_iter: Number of iterations to use in RandomizedCV.
        model_name: Name of model used for training.

    Returns:
        Dataframe with the prediction results per model.
    """

    Y = x[["target", "train_test_label"]].copy()
    X = x[x.columns.difference(["target"])]

    # Split on the indicator column, then drop it so it is used
    # neither as a feature nor as a label.
    X_train = X[X.train_test_label == "train"].drop(columns=["train_test_label"])
    X_test = X[X.train_test_label == "test"].drop(columns=["train_test_label"])
    y_train = Y[Y.train_test_label == "train"].drop(columns=["train_test_label"])
    y_test = Y[Y.train_test_label == "test"].drop(columns=["train_test_label"])

    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    # Leave the search single-threaded (the n_jobs default): the
    # parallelism in this script comes from the worker processes, and
    # nesting multiprocessing inside them is best avoided.
    random_clf = RandomizedSearchCV(clf, param_distributions=param_dist,
        n_iter=n_iter, scoring="f1")

    random_clf.fit(X_train, y_train.ravel())

    y_class = random_clf.predict(X_test)
    y_class = pd.DataFrame(y_class, columns=["y_class"])
    y_class["model_name"] = model_name

    # Attach the true labels so each model's predictions can be scored.
    y_true = pd.DataFrame(y_test, columns=["y_true"])
    df = pd.concat([y_class, y_true], axis = 1)

    return df


if __name__ == '__main__':

    iris = load_iris()

    iris = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                         columns= iris['feature_names'] + ['target'])

    # Making it into a binary classification problem
    iris["target"] = np.where(iris["target"].values == 2, 1, 0)

    # Shuffle first: the raw iris rows are ordered by class, so
    # unshuffled slices would each contain only a single class
    iris = iris.sample(frac=1, random_state=0).reset_index(drop=True)

    # Create non-overlapping training and testing samples
    # (.iloc excludes the end position, whereas .loc would overlap
    # at rows 70, 100 and 130)
    iris_train_1 = iris.iloc[0:70]
    iris_test_1 = iris.iloc[70:100]
    iris_train_2 = iris.iloc[100:130]
    iris_test_2 = iris.iloc[130:150]

    # Assign an indicator to tell which is training and which is testing
    iris_train_1 = iris_train_1.assign(train_test_label="train")
    iris_test_1 = iris_test_1.assign(train_test_label="test")

    # Concatenate the training and testing sets together
    iris_1 = pd.concat([iris_train_1, iris_test_1], axis = 0)

    iris_train_2 = iris_train_2.assign(train_test_label="train")
    iris_test_2 = iris_test_2.assign(train_test_label="test")

    iris_2 = pd.concat([iris_train_2, iris_test_2], axis = 0)

    # Put them into a list to iterate over with multiprocessing.Pools
    dataset = [iris_1, iris_2]

    # Start the training process
    results = apply_parallel_training(random_forest, gradient_boosting_tree,
        dataset = dataset)

    print(pd.concat(results))
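Running this prints a single stacked DataFrame with one row per test observation: the predicted class (y_class), the model that produced it (model_name), and the true label (y_true), across all four model/dataset combinations.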

0 Answers:

No answers yet