我有一个1-d类标签数组。如下(例如):
Y
array([16, 34, 1, 1, 35, 1, 1, 16, 16, 16, 16, 16, 16, 34, 7, 21, 30,
16, 16, 21, 21, 20, 20, 16, 23, 23, 4, 23, 21, 16, 20, 33, 16, 16,
16, 34, 21, 34, 34, 1, 1, 16, 16, 34, 23, 34, 16, 34, 23, 1, 20,
1, 20, 21, 21, 35, 36, 16, 16, 16, 33, 16, 21, 16, 16, 16, 36, 7,
25, 16, 16, 16, 34, 33, 20, 20, 16, 20, 16, 34, 36, 20, 20, 16, 13,
16, 20, 20, 21, 1, 20, 20, 16, 16, 33, 16, 16, 21, 35, 20, 16, 16
记录总数约为 21K。
每个唯一值代表一个对象的类别。这是一个包含 37 个不同类别的多分类问题。
现在我要将这些标签二值化,以便在 One-vs-All(一对多)多分类设置中使用。
我这样做:
Y=label_binarize(Y, classes=range(len(class_mapping)))
这里 class_mapping 是从文本类名到数字类值的映射字典,它的长度就是类别总数。这样,我就把类标签从 1-d 数组二值化成了 n-d 数组。
现在,如果我想把这个 n-d 数组还原为原来的 1-d 数组,该怎么做?
编辑:执行随机森林的代码
def RF(k_fold, train_X, train_Y):
    """Cross-validate a multi-class random forest and return the last model.

    For every fold a new ``RandomForestClassifier`` (150 trees, entropy
    criterion) is fitted on the fold's training split and scored on its
    held-out split. The mean of the per-fold scores is printed.

    Args:
        k_fold: Iterable of ``(train_indices, test_indices)`` pairs.
        train_X: Feature data, indexed with the fold indices (presumably a
            NumPy array -- confirm against the caller).
        train_Y: Labels aligned with ``train_X``, indexed the same way.

    Returns:
        The classifier fitted during the last fold. It has only seen that
        fold's training split, not all of ``train_X``.

    Raises:
        ValueError: If ``k_fold`` yields no folds, since there is then no
            fitted classifier to return.
    """
    from sklearn.ensemble import RandomForestClassifier

    scores_rf = []
    rf = None
    for train_indices, test_indices in k_fold:
        train_X_cv = train_X[train_indices]
        train_Y_cv = train_Y[train_indices]
        test_X_cv = train_X[test_indices]
        test_Y_cv = train_Y[test_indices]

        rf = RandomForestClassifier(n_estimators=150, criterion='entropy')
        rf.fit(train_X_cv, train_Y_cv)
        scores_rf.append(rf.score(test_X_cv, test_Y_cv))

    if rf is None:
        # Without this guard the function would print a NaN mean and then
        # fail with an UnboundLocalError on the return statement.
        raise ValueError("k_fold yielded no (train, test) splits")

    # A single formatted string prints identically on Python 2 and 3.
    print("The mean accuracy of Random Forests on CV data is: {}".format(
        np.mean(scores_rf)))
    return rf
def test_performance(test_X, test_Y, classifier, name):
    """Print accuracy, classification report and confusion matrix on test data.

    Args:
        test_X: Test features passed to ``classifier.score`` and
            ``classifier.predict``.
        test_Y: True labels for ``test_X``.
        classifier: Fitted estimator exposing ``score`` and ``predict``.
        name: Human-readable classifier name used in the printed headings.
    """
    from sklearn import metrics

    accuracy = classifier.score(test_X, test_Y)
    # Predict once and reuse the result for both reports.
    predicted = classifier.predict(test_X)

    # Single-argument print(...) calls behave the same on Python 2 and 3;
    # the former bare print statements were a SyntaxError on Python 3.
    print("The accuracy of {} on test data is: {}".format(name, accuracy))
    print('Classification Metrics for ' + name + ' :')
    print(metrics.classification_report(test_Y, predicted))
    print("Confusion matrix")
    print(metrics.confusion_matrix(test_Y, predicted))
def plot_ROC(test_X, test_Y, classifier, name):
    """Plot per-class, micro-average and macro-average ROC curves.

    Args:
        test_X: Test features passed to the classifier's scoring method.
        test_Y: Binarized true labels; indexed as ``test_Y[:, i]``, so it
            must be 2-D with one column per class.
        classifier: Fitted estimator. ``predict_proba`` is used when ``name``
            is ``'rf'`` or ``'sgd_svm'``, otherwise ``decision_function``.
        name: Classifier name; selects the scoring method and is used in the
            plot title.

    The number of classes is taken from the module-level ``class_mapping``.
    The curves are drawn on a new matplotlib figure; nothing is returned.
    """
    from sklearn.metrics import roc_curve, auc

    # Create the figure once. Previously a second plt.figure() call left this
    # sized figure empty and drew the curves on a default-sized one.
    plt.figure(figsize=(15, 9))

    if name in ['rf', 'sgd_svm']:
        Y_score = classifier.predict_proba(test_X)
        if isinstance(Y_score, list):
            # Multi-output estimators return one (n_samples, 2) array per
            # label column; keep the positive-class probability of each.
            Y_score = np.column_stack([proba[:, 1] for proba in Y_score])
    else:
        Y_score = classifier.decision_function(test_X)
    # The per-class loop below indexes Y_score[:, i], so the scores must stay
    # a 2-D (n_samples, n_classes) matrix rather than a single column.
    Y_score = np.asarray(Y_score)

    n_classes = len(class_mapping)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(test_Y[:, i], Y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(test_Y.ravel(), Y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Compute macro-average ROC curve and ROC area.
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["micro"]),
             linewidth=2)
    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["macro"]),
             linewidth=2)
    for i in range(n_classes):
        plt.plot(fpr[i], tpr[i], label='AUC class {0} (area = {1:0.2f})'
                                       ''.format(i, roc_auc[i]))

    # Chance-level diagonal for reference
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Multi-Class ROC Curve of ' + name)
    plt.legend(loc="lower right")
要绘制多类 ROC,我需要对标签进行二值化;但要在 test_performance 函数中使用分类指标,我又需要把标签还原为一维数组。