I am a beginner in machine learning. I have a dataset that is not normalized, but I will apply StandardScaler as part of the preprocessing. There are multiple classes (1, 2, ..., 10), and I want to know how to apply 10-fold cross-validation instead of train_test_split. My current code is below, and a sketch of what I think the 10-fold setup might look like follows it at the end of the post.
#Importing Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib  # sklearn.externals.joblib is deprecated/removed in newer scikit-learn versions
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
#Loading the dataset (the CSV has a header row with the column names)
dataset = pd.read_csv('finalDataset.csv')
#print('Shape of the dataset: ' + str(dataset.shape))
#print(dataset.head())
#Creating the dependent variable class
factor = pd.factorize(dataset['DJ class'])
definitions = factor[1]
#print(definitions)
#Splitting the data into independent and dependent variables
X = dataset.iloc[:,3:1941].values
y = dataset.iloc[:,0].values
#print('The independent features set: ')
#print(X[:5,:])
#print('The dependent variable: ')
#print(y[:5])
# Creating the Training and Test set from data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 30)
# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 40)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
#Reverse factorize (mapping the numeric codes in y_test and y_pred back to the original DJ class labels)
reversefactor = dict(zip(range(1,11),definitions))
#print(reversefactor)
y_test = np.vectorize(reversefactor.get)(y_test)
y_pred = np.vectorize(reversefactor.get)(y_pred)
# Making the Confusion Matrix
print(pd.crosstab(y_test, y_pred, rownames=['Actual DJ'], colnames=['Predicted DJ']))
sk_report = classification_report(
    y_true=y_test,
    y_pred=y_pred,  # use the already reverse-factorized predictions so the labels match y_test
    digits=6)
print(sk_report)
#
#
print('accuracy_score', accuracy_score(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
print(cm)
# save the model to disk
modelFilename = 'randomforestmodel.pkl'
if accuracy_score(y_test, y_pred) * 100 > 75:
    joblib.dump(classifier, modelFilename)
    print("Saved model to disk")
Can someone help me? Thank you.