We have NVIDIA Tesla K80 GPU accelerator machines in our data center with the following characteristics: Intel(R) Xeon(R) CPU E5-2670 v3 @2.30GHz, 48 CPU processors, 128GB RAM, 12 CPU cores, running 64-bit Linux.
I am running the following code, which performs a GridSearchCV of a RandomForestRegressor model after vertically appending different datasets into a single data series. For example, the two sample datasets I am considering can be found in this link.
from joblib import Parallel, delayed
import multiprocessing
import sys
import imp
import glob
import os
import pandas as pd
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import matplotlib
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split
from math import sqrt
df = pd.concat(map(pd.read_csv, glob.glob(os.path.join('', "cubic*.csv"))), ignore_index=True)
#df = pd.read_csv('cubic31.csv')
for i in range(1,3):
    df['X_t'+str(i)] = df['X'].shift(i)
print(df)
df.dropna(inplace=True)
X = (pd.DataFrame({ 'X_%d'%i : df['X'].shift(i) for i in range(3)}).apply(np.nan_to_num, axis=0).values)
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)
X_train = X_train.drop('time', axis=1)
X_test = X_test.drop('time', axis=1)
print(X.shape)
print(df['Y'].shape)
print()
print("Size of X_train:",(len(X_train)))
print("Size of Y_train:",(len(X_train)))
print("Size of X_test:",(len(X_test)))
print("Size of Y_test:",(len(y_test)))
print()
def gridSearchCVParallel():
    # Fit models with grid search, CV=5 (not too low), and use the best model
    parameters = {'n_estimators': [10, 30, 100, 500, 1000]}
    clf_rf = RandomForestRegressor(random_state=1)
    clf = GridSearchCV(clf_rf, parameters, cv=5, scoring='neg_mean_squared_error')
    model = clf.fit(X_train, y_train)
    model.cv_results_['params'][model.best_index_]
    math.sqrt(model.best_score_*-1)
    #####
    print()
    print(model.cv_results_)  # cv_results_ replaces the deprecated grid_scores_
    print("The best score: ", model.best_score_)
    print("RMSE:", math.sqrt(model.best_score_*-1))
    #reg = RandomForestRegressor(criterion='mse')
    clf_rf.fit(X_train, y_train)
    modelPrediction = clf_rf.predict(X_test)
    print(modelPrediction)
    print("Number of predictions:", len(modelPrediction))
    meanSquaredError = mean_squared_error(y_test, modelPrediction)
    print("Mean Square Error (MSE):", meanSquaredError)
    rootMeanSquaredError = sqrt(meanSquaredError)
    print("Root-Mean-Square Error (RMSE):", rootMeanSquaredError)
    ####### to add the trendline
    fig, ax = plt.subplots()
    #df.plot(x='time', y='Y', ax=ax)
    ax.plot(df['time'].values, df['Y'].values)
    fig, ax = plt.subplots()
    index_values = range(0, len(y_test))
    y_test.sort_index(inplace=True)
    X_test.sort_index(inplace=True)
    modelPred_test = clf_rf.predict(X_test)
    ax.plot(pd.Series(index_values), y_test.values)
    PlotInOne = pd.DataFrame(pd.concat([pd.Series(modelPred_test), pd.Series(y_test.values)], axis=1))
    plt.figure(); PlotInOne.plot(); plt.legend(loc='best')
NumberOfCores = multiprocessing.cpu_count()
gridResults = Parallel(n_jobs=NumberOfCores)(delayed(gridSearchCVParallel))
print(gridResults)
When I finally run this program on a huge dataset (about 2 million rows), GridSearchCV takes more than 4 days to complete. After some searching, I found that Python threads can make use of multiple CPUs through concurrent.futures or multiprocessing. As shown in my code, I tried to use multiprocessing (via joblib), but I get this error: TypeError: 'function' object is not iterable. It seems that the function should take an argument as input and that an iterable should be passed as that argument. How can I fix this so that I can make use of multiple CPUs and finish the task in less time?
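From the joblib documentation, my understanding is that the call is supposed to look roughly like the sketch below, where delayed(...) wraps a function and is then called with arguments inside an iterable that Parallel consumes (run_one and param_list here are just placeholders, not part of my real code), but I am not sure how to map my gridSearchCVParallel function onto this pattern:
from joblib import Parallel, delayed
def run_one(n_estimators):
    # placeholder worker; in my real code this would fit one model configuration
    return n_estimators * 2
param_list = [10, 30, 100]
# Parallel consumes an iterable of delayed(...) calls, each already bound to its arguments
results = Parallel(n_jobs=2)(delayed(run_one)(n) for n in param_list)
print(results)  # expected: [20, 60, 200]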
Thanks in advance.
Answer 0 (score: 1)
Do not try to parallelize this yourself, and do not use joblib.Parallel for it. You would be reinventing the wheel, because GridSearchCV is already parallelized. Just pass the n_jobs parameter: it defaults to 1, i.e. by default it uses a single job. To take advantage of a multi-core architecture, pass n_jobs = number_of_cores, where number_of_cores is the number of cores you want to use.
If you check the source code, you will see that GridSearchCV essentially wraps joblib.Parallel itself, so n_jobs=-1 should work for "all cores".
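For example, your grid search could simply become something like the sketch below (this reuses the estimator and parameter grid from your own code; X_train and y_train are assumed to be the training arrays you already built):
import math
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators': [10, 30, 100, 500, 1000]}
clf_rf = RandomForestRegressor(random_state=1)
# n_jobs=-1 tells GridSearchCV to run the cross-validation fits on all available cores
clf = GridSearchCV(clf_rf, parameters, cv=5,
                   scoring='neg_mean_squared_error', n_jobs=-1)
model = clf.fit(X_train, y_train)
print(model.best_params_)
print("RMSE:", math.sqrt(-model.best_score_))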