I am working on machine learning (with Python 2.7.13), using Hyperopt to process the data and obtain the percentage of well-processed data. I want to make the cross-validation multicore, but doing so takes more time than a single core. I am using joblib with the n_jobs attribute to make it multicore. The code is this:
import numpy as np
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
from joblib import Parallel, delayed
import time
from sklearn.cross_validation import cross_val_score  # note: moved to sklearn.model_selection in sklearn >= 0.18
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import arff
import os
from weka.core.converters import Loader
from sklearn import preprocessing
from weka.classifiers import Classifier, Evaluation
import weka.core.jvm as jvm
timetot = time.time()
# TAKES THE DATA ARFF FILE FROM THE DIRECTORY AND CONVERTS IT TO NUMERIC CODES, SO IT CAN BE PROCESSED
########################################################
script_dir = os.getcwd()
rel_path = "data\\iris.arff"
iris_file = os.path.join(script_dir, rel_path)
dataset = arff.load(open(iris_file, 'rb'))
#dataset = arff.load(open('C:\data\iris.arff', 'rb'))
mi_y = np.array(dataset['data'])
data = mi_y[:, 0:mi_y.shape[1] - 1]   # every column except the class label
datos = data
datosunicos = np.unique(datos).tolist()
unicos_datos = list(range(len(datosunicos)))
for j in range(len(datos[0])):
    for i in range(len(datos)):
        # map every attribute value to its index in the global unique-value list
        posa = datosunicos.index(datos[i, j])
        datos[i, j] = unicos_datos[posa]
data = datos.astype(np.float64)
y = mi_y[:, mi_y.shape[1] - 1]   # the last column holds the class label
unicos = np.unique(y).tolist()
unicos_numericos = list(range(len(unicos)))
bar = y
for i in range(len(bar)):
    # replace each class label with its numeric index
    pos = unicos.index(bar[i])
    bar[i] = unicos_numericos[pos]
y = bar.astype(np.int32)
X = data
counter = 0
###########################################################
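# (Sketch, commented out: the manual encoding loops above could also be done
# with sklearn's LabelEncoder. Note this variant encodes each column
# independently, so the integer codes differ from the global mapping above.)
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# for j in range(datos.shape[1]):
#     datos[:, j] = le.fit_transform(datos[:, j])
# data = datos.astype(np.float64)
# y = LabelEncoder().fit_transform(mi_y[:, -1]).astype(np.int32)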
def hyperopt_train_test(params):
    from sklearn.preprocessing import normalize
    from sklearn.preprocessing import scale
    X_ = X[:]
    if 'normalize' in params:
        if params['normalize'] == 1:
            X_ = normalize(X_)
        del params['normalize']
    if 'scale' in params:
        if params['scale'] == 1:
            X_ = scale(X_)
        del params['scale']
    # CHOOSE THE ALGORITHM USED TO PROCESS THE DATA AND CROSS-VALIDATE.
    # **HERE IS WHERE I ASSIGN THE CORES, WITH N_JOBS=-1**
    ##########################################################
    # clf = SVC(**params)
    # clk = KNeighborsClassifier(**params)
    # clnb = GaussianNB(**params)
    clrf = RandomForestClassifier(**params)
    # clmlp = MLPClassifier(**params)
    # clf = SVR(**params)
    return cross_val_score(clrf, X_, y, cv=10, n_jobs=-1).mean()
##########################################################
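# (Timing sketch, commented out: on a dataset as small as iris, the worker
# startup overhead of n_jobs=-1 can exceed the actual fold computation, which
# is one plausible reason the multicore run is slower. Assumes X and y above.)
# t0 = time.time()
# cross_val_score(RandomForestClassifier(n_estimators=20), X, y, cv=10, n_jobs=1)
# print 'n_jobs=1 :', time.time() - t0
# t0 = time.time()
# cross_val_score(RandomForestClassifier(n_estimators=20), X, y, cv=10, n_jobs=-1)
# print 'n_jobs=-1:', time.time() - t0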
#DEFINE THE SEARCH SPACES FOR EACH ALGORITHM
space4svm = {
    'C': hp.uniform('C', 0, 20),
    'kernel': hp.choice('kernel', ['linear', 'sigmoid', 'poly', 'rbf']),
    # 'kernel': hp.choice('kernel', ['linear']),
    # 'epsilon': hp.choice('epsilon', [0.1]),
    'gamma': hp.uniform('gamma', 0, 20),
    'scale': hp.choice('scale', [0, 1]),
    'normalize': hp.choice('normalize', [0, 1])
}
space4KNN = {
    'n_neighbors': hp.choice('n_neighbors', [1, 2, 3, 4, 5]),
    'scale': hp.choice('scale', [0, 1]),
    'normalize': hp.choice('normalize', [0, 1])
}
space4NB = {
    'scale': hp.choice('scale', [0, 1]),
    'normalize': hp.choice('normalize', [0, 1])
}
space4RF = {
    'n_estimators': hp.choice('n_estimators', np.arange(10, 30, dtype=int)),
    'max_features': hp.uniform('max_features', 0.25, 1),
    'scale': hp.choice('scale', [0, 1]),
    'normalize': hp.choice('normalize', [0, 1])
}
space4MLP = {
    'momentum': hp.uniform('momentum', 0, 0.05),
    'scale': hp.choice('scale', [0, 1]),
    'normalize': hp.choice('normalize', [0, 1])
}
def f(params):
    global counter
    acc = hyperopt_train_test(params)
    counter = counter + 1
    print counter, acc
    return {'loss': -acc, 'status': STATUS_OK}
#HERE IS WHERE I WANT TO MAKE IT MULTICORE, WHEN THE FMIN FUNCTION IS CALLED
if __name__ == '__main__':
    trials = Trials()
    best = fmin(f, space4RF, algo=tpe.suggest, max_evals=100, trials=trials)
    print 'best:'
    print best
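    # (Note, commented out: for hp.choice parameters, fmin returns the index of
    # the chosen option rather than its value; hyperopt.space_eval maps it back.)
    # from hyperopt import space_eval
    # print space_eval(space4RF, best)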
    #CHOOSE THE PARAMETERS DEPENDING ON THE ALGORITHM IN USE
    #############################################################
    #parameters = ['C', 'kernel', 'gamma', 'scale', 'normalize']
    #parameters = ['n_neighbors', 'scale', 'normalize']
    #parameters = ['scale', 'normalize']
    parameters = ['n_estimators', 'max_features', 'scale', 'normalize']
    #parameters = ['momentum', 'scale', 'normalize']
    #############################################################
    cols = len(parameters)
    fig, axes = plt.subplots(nrows=1, ncols=cols, figsize=(20, 5))  # 'fig' instead of 'f', so the objective function is not shadowed
    cmap = plt.cm.jet
    for i, val in enumerate(parameters):
        xs = np.array([t['misc']['vals'][val] for t in trials.trials]).ravel()
        ys = [-t['result']['loss'] for t in trials.trials]
        # xs, ys = zip(*sorted(zip(xs, ys)))
        axes[i].scatter(xs, ys, s=20, linewidth=0.01, alpha=0.25, c=cmap(float(i) / len(parameters)))
        axes[i].set_title(val)
        axes[i].set_ylim([0.9, 1.0])
    #PRINTS TOTAL TIME
    print("TOTAL TIME:")
    print(time.time() - timetot)
    plt.show()  # display the per-parameter scatter plots
I get a processing time of 96 seconds with a single core, and 296 seconds with multicore processing.
Thank you very much for your help.