我希望监控多类Gradient Boosting分类器训练过程中的损失,以了解是否发生过度拟合。这是我的代码:
%matplotlib inline
import numpy as np
#import matplotlib.pyplot as plt
import matplotlib.pylab as plt
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
iris = datasets.load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
n_est = 100
clf = GradientBoostingClassifier(n_estimators=n_est, max_depth=3, random_state=2)
clf.fit(X_train, y_train)
test_score = np.empty(len(clf.estimators_))
for i, pred in enumerate(clf.staged_predict(X_test)):
test_score[i] = clf.loss_(y_test, pred)
plt.plot(np.arange(n_est) + 1, test_score, label='Test')
plt.plot(np.arange(n_est) + 1, clf.train_score_, label='Train')
plt.show()
但是我收到以下值错误:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-33-27194f883893> in <module>()
22 test_score = np.empty(len(clf.estimators_))
23 for i, pred in enumerate(clf.staged_predict(X_test)):
---> 24 test_score[i] = clf.loss_(y_test, pred)
25 plt.plot(np.arange(n_est) + 1, test_score, label='Test')
26 plt.plot(np.arange(n_est) + 1, clf.train_score_, label='Train')
C:\Documents and Settings\Philippe\Anaconda\lib\site-packages\sklearn\ensemble\gradient_boosting.pyc in __call__(self, y, pred)
396 Y[:, k] = y == k
397
--> 398 return np.sum(-1 * (Y * pred).sum(axis=1) +
399 logsumexp(pred, axis=1))
400
ValueError: operands could not be broadcast together with shapes (45,3) (45)
我知道如果我使用GradientBoostingRegressor这个代码工作正常,但是我无法弄清楚如何使用GradientBoostingClassifier这样的多类分类器。谢谢你的帮助。
答案 0(得分:4)
似乎 `loss_` 需要一个形状为 (n_samples, k) 的数组,而 `staged_predict` 返回的是形状为 [n_samples] 的数组(根据文档)。您可能应该把 `staged_predict_proba` 或 `staged_decision_function` 的结果传入 `loss_`。
我认为你可以按如下方式测量训练集和测试集上的损失:
# Measure the per-stage deviance on both splits.
# staged_decision_function yields (n_samples, n_classes) scores at each
# boosting stage, which is the shape clf.loss_ requires (staged_predict
# would yield labels and raise the broadcasting ValueError).
test_score = np.empty(len(clf.estimators_))
train_score = np.empty(len(clf.estimators_))

for i, score in enumerate(clf.staged_decision_function(X_test)):
    test_score[i] = clf.loss_(y_test, score)
for i, score in enumerate(clf.staged_decision_function(X_train)):
    train_score[i] = clf.loss_(y_train, score)

# Qualify with plt.: bare plot()/legend() only work under a pylab
# star-import, which this snippet does not assume.
plt.plot(test_score, label='test score')
plt.plot(train_score, label='train score')
plt.legend()
plt.show()
请注意,第二次调用 `loss_` 时我传入的是训练集。输出看起来符合我的预期:
答案 1(得分:0)
你可以使用这样的东西。 这是一个使用knn的例子。
# k-NN example: sweep the number of neighbors and record train/test accuracy.
# The original snippet used KNeighborsClassifier without importing it.
from sklearn.neighbors import KNeighborsClassifier

# Setup arrays to store train and test accuracies
neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

# Loop over different values of k
for i, k in enumerate(neighbors):
    # Setup a k-NN classifier with k neighbors
    knn = KNeighborsClassifier(n_neighbors=k)
    # Fit the classifier to the training data
    knn.fit(X_train, y_train)
    # Compute accuracy on the training set
    train_accuracy[i] = knn.score(X_train, y_train)
    # Compute accuracy on the testing set
    test_accuracy[i] = knn.score(X_test, y_test)

# Generate plot
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, test_accuracy, label='Testing Accuracy')
plt.plot(neighbors, train_accuracy, label='Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()