Question

我有一个一维（1-D）的类别标签数组，例如：

Y
array([16, 34,  1,  1, 35,  1,  1, 16, 16, 16, 16, 16, 16, 34,  7, 21, 30,
       16, 16, 21, 21, 20, 20, 16, 23, 23,  4, 23, 21, 16, 20, 33, 16, 16,
       16, 34, 21, 34, 34,  1,  1, 16, 16, 34, 23, 34, 16, 34, 23,  1, 20,
        1, 20, 21, 21, 35, 36, 16, 16, 16, 33, 16, 21, 16, 16, 16, 36,  7,
       25, 16, 16, 16, 34, 33, 20, 20, 16, 20, 16, 34, 36, 20, 20, 16, 13,
       16, 20, 20, 21,  1, 20, 20, 16, 16, 33, 16, 16, 21, 35, 20, 16, 16

记录数约为 21K。

每个唯一值表示一个对象的类别。这是一个包含 37 个不同类别的多分类（multi-class）问题。

现在我想把这些标签二值化，以便在 One-vs-All 多分类设置中使用它们。

我这样做：

# Binarize the 1-D integer labels one-vs-all; the class list passed in is 0 .. len(class_mapping) - 1.
Y=label_binarize(Y, classes=range(len(class_mapping)))

这里 class_mapping 是从文本类别名称到数字类别值的映射字典，它的长度就是类别总数。这样我就把类别值从 1-D 数组二值化成了 n-d 数组。

现在，如果我想从这个 n-d 数组还原出原来的 1-D 数组，该怎么做？

编辑：执行随机森林的代码

def RF(k_fold,train_X,train_Y):
    """Cross-validate a multi-class random forest and return the last fitted model.

    A fresh ``RandomForestClassifier(n_estimators=150, criterion='entropy')``
    is fitted on every training split and scored on the matching test split.
    The mean of those scores is printed.

    Args:
        k_fold: Iterable yielding ``(train_indices, test_indices)`` pairs.
        train_X: Feature data; indexed with the index collections from
            ``k_fold`` (presumably a NumPy array -- confirm against caller).
        train_Y: Labels aligned with ``train_X``; indexed the same way.

    Returns:
        The classifier fitted on the LAST split only. It has not been
        refitted on the full training data.

    Raises:
        ValueError: If ``k_fold`` yields no splits, so no model was fitted.
    """
    from sklearn.ensemble import RandomForestClassifier

    scores_rf = []
    rf = None

    for train_indices, test_indices in k_fold:
        train_X_cv = train_X[train_indices]
        train_Y_cv = train_Y[train_indices]

        test_X_cv = train_X[test_indices]
        test_Y_cv = train_Y[test_indices]

        rf = RandomForestClassifier(n_estimators=150, criterion='entropy')
        rf.fit(train_X_cv, train_Y_cv)
        scores_rf.append(rf.score(test_X_cv, test_Y_cv))

    if rf is None:
        # Without this guard the function would print a NaN mean and then
        # fail with an UnboundLocalError on the return statement.
        raise ValueError("k_fold produced no train/test splits")

    # A single pre-formatted string prints the same line on Python 2 and 3;
    # print("...", value) shows a tuple under the Python 2 print statement.
    print("The mean accuracy of Random Forests on CV data is: %s"
          % np.mean(scores_rf))

    return rf


def test_performance(test_X,test_Y,classifier,name):
    """Print accuracy, classification report and confusion matrix on test data.

    Args:
        test_X: Test features passed to ``classifier.score`` / ``predict``.
        test_Y: True labels for ``test_X``.
        classifier: Fitted estimator exposing ``score`` and ``predict``.
        name: Human-readable classifier name used in the printed headings.

    Returns:
        None. All results are written to standard output.
    """
    from sklearn import metrics

    # Predict once and reuse the result for both reports.
    predictions = classifier.predict(test_X)

    # Every print below takes exactly one argument, so the output is the same
    # under the Python 2 print statement and the Python 3 print function.
    print("The accuracy of " + name + " on test data is: %s"
          % classifier.score(test_X, test_Y))
    print('Classification Metrics for '+ name+ ' :')
    print(metrics.classification_report(test_Y, predictions))
    print("Confusion matrix")
    print(metrics.confusion_matrix(test_Y, predictions))



def plot_ROC(test_X,test_Y,classifier,name):
    """Plot per-class, micro-averaged and macro-averaged ROC curves.

    Args:
        test_X: Test features passed to the classifier.
        test_Y: Binarized true labels. It is indexed as ``test_Y[:, i]``, so
            it must be 2-D with one column per class (e.g. the output of
            ``label_binarize``).
        classifier: Fitted estimator. ``predict_proba`` is used when ``name``
            is ``'rf'`` or ``'sgd_svm'``; otherwise ``decision_function``.
        name: Classifier name; selects the scoring method and is appended to
            the plot title.

    Returns:
        None. The curves are drawn on a new 15x9 pyplot figure; this function
        does not show or save it.

    Notes:
        The number of classes is read from the module-level ``class_mapping``.
    """
    from sklearn.metrics import roc_curve, auc

    if name in ['rf', 'sgd_svm']:
        # Keep the scores for ALL classes. The previous ``.T[1]`` kept a
        # single class, which made ``Y_score[:, i]`` below impossible.
        Y_score = classifier.predict_proba(test_X)
        if isinstance(Y_score, list):
            # Multi-output estimators (e.g. a forest fitted on binarized
            # labels) return one array per label column; take the
            # positive-class probability from each.
            # NOTE(review): assumes every per-label array has two columns.
            Y_score = np.column_stack([proba[:, 1] for proba in Y_score])
    else:
        Y_score = classifier.decision_function(test_X)

    n_classes = len(class_mapping)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(test_Y[:, i], Y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(test_Y.ravel(), Y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Compute macro-average ROC curve and ROC area.
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points. np.interp replaces the
    # bare ``interp`` name, which this function never imported.
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves on one explicitly sized figure. Previously a second
    # plt.figure() call left the 15x9 figure empty.
    plt.figure(figsize=(15, 9))
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
                   .format(roc_auc["micro"]),
             linewidth=2)

    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
                   .format(roc_auc["macro"]),
             linewidth=2)

    for i in range(n_classes):
        plt.plot(fpr[i], tpr[i],
                 label='AUC class {0} (area = {1:0.2f})'
                       .format(i, roc_auc[i]))

    # Diagonal reference line: the ROC of a random classifier.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Multi-Class ROC Curve of '+name)
    plt.legend(loc="lower right")

要绘制多分类 ROC 曲线，我需要先对标签进行二值化；但要在 test_performance 函数中使用分类指标，我又需要把标签还原为一维数组。

将 ndarray 转换回 1-D 数组

0 个答案: