使用数据集实现ROC曲线:ValueError:数据不是二进制,并且未指定pos_label

时间:2017-10-14 19:01:11

标签: python classification random-forest roc

我发现使用python绘制这条roc曲线有点困难。我看到使用硬编码数字的例子,但不是数据集。我一直在上面的标题中得到错误声明,但我还没有找到解决方案。我的目标是使用Pandas从数据集中提取数据并使用Random Forest对数据进行分类。从那里我想要执行我的预测并根据测试集和预测绘制ROC曲线,但是我现在卡住了。代码如下:

#Required Python Packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import random

import sklearn
print(sklearn.__version__)

#File Paths
INPUT_PATH = "C:/Users/DGunn/Documents/userData/allUsers/FusedTouchData/fusedTouchData_noHeaders.csv"
OUTPUT_PATH = "C:/Users/DGunn/Documents/userData/allUsers/FusedTouchData/fusedTouchData_addedHeaders_testConversion.csv"

#Headers

headers = ["UserID", "Systime", "EventTime", "ActivityID", "Pointer_count", "PointID", "ActionID", "X", "Y", "Pressure",
           "Contact_size", "Phone_orientation"]

def read_data(path):
    '''

    :param path:
    :return:
    '''

    data = pd.read_csv(path)
    return data

def get_headers(dataset):
    '''

    dataset headers
    :param dataset:
    :return:
    '''
    return dataset.columns.values

def add_headers(dataset, headers):
    '''

    Add the headers to the dataset
    :param dataset:
    :param headers:
    :return:
    '''

    dataset.columns = headers
    return dataset

def data_file_to_csv():
    '''

    :return:
    '''

    #Headers
    "headers = ['HeaderName1', 'HeaderName2']"

    #Load the dataset into Pandas data frame
    dataset = read_data(INPUT_PATH)

    #Add the headers to the loaded dataset
    dataset = add_headers(dataset, headers)

    #Save the loaded dataset into csv format
    dataset.to_csv(OUTPUT_PATH, index=False)
    print("File update completed!")

def split_dataset(dataset, train_percentage, feature_headers, target_header):
    """
    Split the dataset with train_percentage
    :param dataset:
    :param train_percentage:
    :param feature_headers:
    :param target_header:
    :return:
    """

    #Split dataset into train and test dataset
    train_x, test_x, train_y, test_y = train_test_split(dataset[feature_headers], dataset[target_header],
                                                        train_size=train_percentage)
    return train_x, test_x, train_y, test_y

def handle_missing_values(dataset, missing_values_header, missing_label):
    """

    Filter missing values from the dataset
    :param dataset:
    :param missing_values_header:
    :param missing_label:
    :return:
    """

    return dataset[dataset[missing_values_header] != missing_label]

def random_forest_classifier(features, target):
    """
    To train the random forest classifier with features and target data
    :param features:
    :param target:
    :return:
    """
    clf = RandomForestClassifier()
    clf.fit(features, target)
    return clf

def dataset_statistics(dataset):
    """
    Basic statistics of the dataset
    :param dataset:
    :return:
    """
    print(dataset.describe())

def main():
    """
    Main Function
    :return:
    """
    #Load initial dataset
    data_file_to_csv()
    #Load the csv file into pandas dataframe
    dataset = pd.read_csv(OUTPUT_PATH)
    #Get basic statistics of the loaded dataset
    dataset_statistics(dataset)
    dataset = dataset.fillna(method='ffill')
    print(np.all(np.isfinite(dataset)))

    #Filter missing values
    ##dataset = handle_missing_values(dataset, headers[11], '?')
    train_x, test_x, train_y, test_y = split_dataset(dataset, 0.4, headers[1:], headers[0])
    print(headers[1:], headers[0])

    #Train and Test dataset size details
    print("Train_x Shape:: ", train_x.shape)
    print("Train_y Shape :: ", train_y.shape)
    print("Test_x Shape :: ", test_x.shape)
    print("Test_y Shape :: ", test_y.shape)

    # Create random forest classifier instance

    trained_model = random_forest_classifier(train_x, train_y)
    print("Trained model :: ", trained_model)
    predictions = trained_model.predict(test_x)


    for i in range(0, 500):
        print("Actual outcome :: {} and Predicted outcome :: {}".format(list(test_y)[i], predictions[i]))

    print("Train Accuracy :: ", accuracy_score(train_y, trained_model.predict(train_x)))
    print("Test Accuracy  :: ", accuracy_score(test_y, predictions))
    print(" Confusion matrix ", confusion_matrix(test_y, predictions))

    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions)
    roc_auc = auc(false_positive_rate, true_positive_rate)

    plt.title('ROC Curve')
    plt.plot(false_positive_rate, true_positive_rate, 'b',
             label='AUC = %0.2f'% roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0,1],[0,1], 'r--')
    plt.xlim([-0.1,1.2])
    plt.ylim([-0.1,1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

if __name__ == "__main__":
    main()

认为你们可以帮助我吗?

提前致谢

以下是结果的完整错误和数据的快速描述:

              UserID       Systime     EventTime    ActivityID  Pointer_count  \
count  361834.000000  3.618380e+05  3.618380e+05  3.618380e+05   3.618380e+05   
mean   163572.310084  3.869149e+17  2.290281e+07  1.638691e+14   1.912458e+09   
std     27420.804226  2.327400e+20  4.654771e+09  2.745967e+13   5.768818e+11   
min    100669.000000  1.519850e+05  3.855300e+04  4.529300e+04   1.000000e+00   
25%    151985.000000  1.400000e+12  1.193756e+06  1.520000e+14   1.000000e+00   
50%    171538.000000  1.400000e+12  2.643164e+06  1.720000e+14   1.000000e+00   
75%    180679.000000  1.400000e+12  5.610611e+06  1.810000e+14   1.000000e+00   
max    186676.000000  1.400000e+23  1.400000e+12  1.870000e+14   1.870000e+14   

             PointID       ActionID              X              Y  \
count  361838.000000  361838.000000  361838.000000  361838.000000   
mean        0.129821       1.810238     566.241327     694.303258   
std         0.342847       0.756644     256.634365     453.203547   
min         0.000000       0.000000       0.000000     -71.000000   
25%         0.000000       2.000000     360.558122     261.000000   
50%         0.000000       2.000000     579.000000     617.102750   
75%         0.000000       2.000000     764.032800    1107.000000   
max         3.000000       6.000000    1062.822500    1840.000000   

            Pressure   Contact_size  Phone_orientation  
count  361838.000000  361838.000000       3.618380e+05  
mean        1.007636       0.024858       2.384341e-07  
std         2.817316       0.004948       7.215543e-05  
min         1.000000       0.011765       0.000000e+00  
25%         1.000000       0.021569       0.000000e+00  
50%         1.000000       0.023529       0.000000e+00  
75%         1.000000       0.027451       0.000000e+00  
max      1357.000000       1.000000       2.549020e-02  
True
C:\Users\DGunn\Anaconda3\envs\tensorflow35\lib\site-packages\sklearn\model_selection\_split.py:2010: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
  FutureWarning)
['Systime', 'EventTime', 'ActivityID', 'Pointer_count', 'PointID', 'ActionID', 'X', 'Y', 'Pressure', 'Contact_size', 'Phone_orientation'] UserID
144735 217103
(361838,)
Train_x Shape::  (144735, 11)
Train_y Shape ::  (144735,)
Test_x Shape ::  (217103, 11)
Test_y Shape ::  (217103,)
Trained model ::  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Train Accuracy ::  1.0
Test Accuracy  ::  1.0
Traceback (most recent call last):
  File "C:/Users/DGunn/PycharmProjects/activeCA/RandomForest/RandomF_fusedTouchDataClassification.py", line 199, in <module>
    main()
 Confusion matrix  [[28959     0     0     0     0]
  File "C:/Users/DGunn/PycharmProjects/activeCA/RandomForest/RandomF_fusedTouchDataClassification.py", line 183, in main fpr, tpr, thresholds = roc_curve(test_y, predictions)
 [    0 42594     0     0     0]  
 [    0     0 54374     0     0]
 [    0     0     0 37516     0]
 [    0     0     0     0 53660]]
  File "C:\Users\DGunn\Anaconda3\envs\tensorflow35\lib\site-packages\sklearn\metrics\ranking.py", line 510, in roc_curve
    y_true, y_score, pos_label=pos_label, sample_weight=sample_weight)
  File "C:\Users\DGunn\Anaconda3\envs\tensorflow35\lib\site-packages\sklearn\metrics\ranking.py", line 319, in _binary_clf_curve
    raise ValueError("Data is not binary and pos_label is not specified")
ValueError: Data is not binary and pos_label is not specified

0 个答案:

没有答案