I am trying to use ensemble learning in Python, and I don't think I can keep the train/test split separate while also doing 10-fold cross-validation. ... Does anyone know how to handle this in Python?
I would also like to see the result of each iteration (each fold) and the standard deviation of the output. ...
I am using Python 2.7.15 (Anaconda) in a Jupyter Notebook on macOS.
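What I understand so far is that the cross-validation should run on the training split only, so the held-out test set is never touched during tuning. A minimal sketch of that pattern (my own illustration, with random placeholder data standing in for my dataset):

import numpy as np
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Placeholder data; stands in for the real realData.csv columns.
X_demo = np.random.rand(100, 4)
y_demo = np.random.randint(0, 2, 100)

# Hold out 20% for the final test, then run 10-fold CV inside the training part only.
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.20, random_state=42)
cv = KFold(n_splits=10, shuffle=True, random_state=42)

scores = cross_val_score(KNeighborsClassifier(), X_tr, y_tr, cv=cv)
print(scores)  # accuracy of each of the 10 folds
print("mean=%.4f std=%.4f" % (scores.mean(), scores.std()))

Is this the right way to combine the two? My full code is below: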
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier,BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report,accuracy_score
from sklearn import metrics
df = pd.read_csv('/Users/arbade/Desktop/Datasets/realData.csv',encoding="utf-8")
X = df.drop(columns = ['mobileOp'])
y = df['mobileOp']
seed = 42
num_trees = 25
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)  # random_state is ignored unless shuffle=True
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state = seed)
knn = KNeighborsClassifier(n_neighbors=3, metric='minkowski')
params_knn = {"n_neighbors": np.arange(1, 50)}
knn_gs = GridSearchCV(knn, params_knn, cv=kfold, iid=False)
knn_gs.fit(X_train, y_train)
knn_best = knn_gs.best_estimator_
prediction = knn_best.predict(X_test)  # predict with the tuned estimator, not the untuned knn
print(prediction)
rf = RandomForestClassifier(n_estimators=num_trees, random_state=seed, max_features="sqrt")
params_rf = {"n_estimators": [50, 100]}
rf_gs = GridSearchCV(rf, params_rf, cv=kfold, iid=False)  # iid=False, consistent with the other searches
rf_gs.fit(X_train, y_train)
rf_best = rf_gs.best_estimator_
print(rf_gs.best_params_)
adaBoost = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
params_adaBoost = {"n_estimators": [50, 100]}
adaBoost_gs = GridSearchCV(adaBoost, params_adaBoost, cv=kfold, iid=False)
adaBoost_gs.fit(X_train, y_train)
adaBoost_best = adaBoost_gs.best_estimator_  # already refit on all of X_train; no separate adaBoost.fit needed
grBoost = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)
params_grBoost = {"n_estimators": [50, 100]}
grBoost_gs = GridSearchCV(grBoost, params_grBoost, cv=kfold, iid=False)
grBoost_gs.fit(X_train, y_train)
grBoost_best = grBoost_gs.best_estimator_  # likewise already fitted by the grid search
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,15), random_state=seed)
mlp.fit(X_train,y_train)
dtc=DecisionTreeClassifier(max_depth=10,random_state=seed,criterion='entropy')
dtc.fit(X_train,y_train)
svc = SVC(gamma='scale', kernel='rbf', probability=True,random_state=seed)
svc.fit(X_train,y_train)
nb = MultinomialNB()  # note: MultinomialNB requires non-negative feature values
nb.fit(X_train,y_train)
log_reg = LogisticRegression(penalty='l1', multi_class='multinomial', solver='saga',
                             max_iter=100, C=1e5, random_state=seed, n_jobs=3)
log_reg.fit(X_train, y_train)  # without this fit, log_reg.score() below raises NotFittedError
print("KNN Classifier: {}".format(knn_best.score(X_test, y_test)))
print("Random Forest: {}".format(rf_best.score(X_test, y_test)))
print("Logistic Regression: {}".format(log_reg.score(X_test, y_test)))
print("SVC Classifier: {}".format(svc.score(X_test, y_test)))
print("Naive-Bayes Classifier: {}".format(nb.score(X_test, y_test)))
print("Desicion-Tree: {}".format(dtc.score(X_test, y_test)))
print("Multi-Layer Perceptron: {}".format(mlp.score(X_test, y_test)))
print("AdaBoost: {}".format(adaBoost_best.score(X_test, y_test)))
print("GradientBoosting Classifier: {}".format(grBoost_best.score(X_test, y_test)))
estimators=[("knn", knn_best), ("rf", rf_best), ("log_reg", log_reg),("nb",nb),("svc",svc),("dtc",dtc),("mlp",mlp),("adaBoost",adaBoost_best),('grBoost',grBoost_best)]
ensemble = VotingClassifier(estimators, voting="hard")
ensemble.fit(X_train, y_train)
a=ensemble.score(X_test, y_test)
ensPred=ensemble.predict(X_test)
# Cross-validate on the training split only; using the full X, y here would
# re-use rows held out in X_test and mix the split back together.
results = model_selection.cross_val_score(ensemble, X_train, y_train, cv=kfold)
print("Accuracy: %0.2f (+/- %0.2f)" % (results.mean(), results.std()))
print("std: %",results.std()*100)
print('Ensemble Score: ' + repr(a))  # same quantity as accuracy_score(y_test, ensPred)
print('Average CV Score: ' + repr(results.mean() * 100) + '%')
print(classification_report(y_test, ensPred))
And here is the expected output format:
KNN: [] (each iteration of the 10 folds, accuracy and std)
Random-Forest: [] (each iteration of the 10 folds, accuracy and std)
. . .
Multi-Layer Perceptron: [] (each iteration of the 10 folds, accuracy and std)
Ensemble Score: [] (each iteration of the 10 folds, accuracy and std)
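Something like the loop below is what I had in mind for producing that output (my own sketch, assuming every model above follows the scikit-learn estimator API, and reusing the estimators list, ensemble, kfold, X_train and y_train defined earlier):

for name, model in estimators + [("ensemble", ensemble)]:
    # cross_val_score clones the model, so the fits above are not disturbed
    scores = model_selection.cross_val_score(model, X_train, y_train, cv=kfold)
    print("%s: %s (mean=%.4f, std=%.4f)" % (name, np.round(scores, 4), scores.mean(), scores.std()))

Would this be the correct approach, or is there a better way to get the per-fold accuracy and standard deviation for each classifier?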