I have several datasets, and I want to train multiple models on each of them. For example, on data_1 I want to train a random forest and a gradient boosting model, then do the same on data_2, and so on.
Is training the models in parallel with multiprocessing the best and most efficient approach, or should I just loop over them one at a time? Or should I avoid multiprocessing for the models altogether, since scikit-learn already ships some built-in parallelism (n_jobs)?
Here is the example I am interested in, using a Pool:
import numpy as np
import pandas as pd
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_iris
import multiprocessing as mp
from multiprocessing import Pool
import inspect
def apply_parallel_training(*fns, dataset):
    """
    Train every model function on every dataset in a process pool
    and return a flat list of result DataFrames.
    """
    pool = Pool(mp.cpu_count() - 1)
    # Submit all (model, dataset) jobs first so they can run concurrently,
    # then wait for the pool and collect the results.
    async_results = [pool.map_async(fn, dataset) for fn in fns]
    pool.close()
    pool.join()
    ret_list = [res.get() for res in async_results]
    # Flatten the per-model result lists into a single list of DataFrames.
    return [df for sublist in ret_list for df in sublist]
def random_forest(x):
    """
    Specify the random forest model.
    """
    df = train_some_models(x=x,
                           clf=RandomForestClassifier(),
                           param_dist={"n_estimators": [500],
                                       "max_depth": [5, 7],
                                       # "auto" was removed in recent scikit-learn; "sqrt" is the equivalent
                                       "max_features": ["sqrt", "log2"],
                                       "bootstrap": [True, False],
                                       "criterion": ["gini", "entropy"]},
                           n_iter=5,
                           model_name=inspect.stack()[0][3])
    return df
def gradient_boosting_tree(x):
    """
    Specify the gradient boosting model.
    """
    df = train_some_models(x=x,
                           clf=XGBClassifier(),
                           param_dist={"n_estimators": [200],
                                       "learning_rate": [0.005, 0.01, 0.05, 0.1],
                                       "booster": ["gbtree"],
                                       "max_depth": [5, 7]},
                           n_iter=5,
                           model_name=inspect.stack()[0][3])
    return df
def train_some_models(x, clf, param_dist, n_iter, model_name):
    """
    Train a model on a specified dataset and predict for the known labels.
    Args:
        x: Dataset in the form of a DataFrame.
        clf: Classifier.
        param_dist: Specified model parameters.
        n_iter: Number of iterations to use in RandomizedSearchCV.
        model_name: Name of the model used for training.
    Returns:
        DataFrame with the prediction results for this model.
    """
    Y = x[["target", "train_test_label"]].copy()
    X = x[x.columns.difference(["target"])]
    # Split into train/test sets and drop the indicator column
    # (drop() returns a copy, which avoids SettingWithCopyWarning).
    X_train = X[X.train_test_label == "train"].drop(columns="train_test_label")
    X_test = X[X.train_test_label == "test"].drop(columns="train_test_label")
    y_train = Y[Y.train_test_label == "train"].drop(columns="train_test_label")
    y_test = Y[Y.train_test_label == "test"].drop(columns="train_test_label")
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    random_clf = RandomizedSearchCV(clf, param_distributions=param_dist,
                                    n_iter=n_iter, scoring="f1")
    random_clf.fit(X_train, y_train.ravel())
    y_class = random_clf.predict(X_test)
    y_class = pd.DataFrame(y_class, columns=["y_class"])
    y_class["model_name"] = model_name
    df = pd.concat([y_class, pd.DataFrame(y_test, columns=["target"])], axis=1)
    return df
if __name__ == '__main__':
iris = load_iris()
iris = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
columns= iris['feature_names'] + ['target'])
# Making it into a binary classification problem
iris["target"] = np.where(iris["target"].values == 2, 1, 0)
    # Shuffle the rows so that both classes appear in every split
    # (without this, each contiguous slice below would contain only a single class)
    iris = iris.sample(frac=1, random_state=0).reset_index(drop=True)
    # Create non-overlapping training and testing samples
    iris_train_1 = iris.iloc[0:70, :]
    iris_test_1 = iris.iloc[70:100, :]
    iris_train_2 = iris.iloc[100:130, :]
    iris_test_2 = iris.iloc[130:150, :]
# Assign an indicator to tell which is training and which is testing
iris_train_1 = iris_train_1.assign(train_test_label="train")
iris_test_1 = iris_test_1.assign(train_test_label="test")
# Concatenate the training and testing sets together
iris_1 = pd.concat([iris_train_1, iris_test_1], axis = 0)
iris_train_2 = iris_train_2.assign(train_test_label="train")
iris_test_2 = iris_test_2.assign(train_test_label="test")
iris_2 = pd.concat([iris_train_2, iris_test_2], axis = 0)
    # Put them into a list to iterate over with multiprocessing.Pool
dataset = [iris_1, iris_2]
# Start the training process
results = apply_parallel_training(random_forest, gradient_boosting_tree,
dataset = dataset)
print(pd.concat(results))
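For reference, this is roughly what I mean by relying on scikit-learn's built-in parallelism: a plain loop over the models where RandomizedSearchCV distributes the cross-validated fits across cores with n_jobs=-1 and no outer Pool at all. This is only a simplified sketch to make the comparison concrete (it uses GradientBoostingClassifier and train_test_split instead of the XGBoost/indicator-column setup above), not something I have benchmarked:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split

iris = load_iris()
# Same binary target as above: class 2 vs. the rest
X, y = iris.data, (iris.target == 2).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

models = {
    "random_forest": (RandomForestClassifier(),
                      {"n_estimators": [500], "max_depth": [5, 7]}),
    "gradient_boosting": (GradientBoostingClassifier(),
                          {"n_estimators": [200], "learning_rate": [0.01, 0.1]}),
}

for name, (clf, param_dist) in models.items():
    # n_jobs=-1 lets the search itself fan the CV fits out over all cores,
    # so no outer multiprocessing.Pool is needed.
    search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                n_iter=2, scoring="f1", n_jobs=-1)
    search.fit(X_train, y_train)
    print(name, search.best_params_, search.best_score_)
So which pattern is actually preferable here: the outer Pool over datasets and models, the inner n_jobs parallelism, or simply looping sequentially?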