我发现使用python绘制这条roc曲线有点困难。我看到使用硬编码数字的例子,但不是数据集。我一直在上面的标题中得到错误声明,但我还没有找到解决方案。我的目标是使用Pandas从数据集中提取数据并使用Random Forest对数据进行分类。从那里我想要执行我的预测并根据测试集和预测绘制ROC曲线,但是我现在卡住了。代码如下:
#Required Python Packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import random
import sklearn
print(sklearn.__version__)
#File Paths
INPUT_PATH = "C:/Users/DGunn/Documents/userData/allUsers/FusedTouchData/fusedTouchData_noHeaders.csv"
OUTPUT_PATH = "C:/Users/DGunn/Documents/userData/allUsers/FusedTouchData/fusedTouchData_addedHeaders_testConversion.csv"
#Headers
headers = ["UserID", "Systime", "EventTime", "ActivityID", "Pointer_count", "PointID", "ActionID", "X", "Y", "Pressure",
"Contact_size", "Phone_orientation"]
def read_data(path):
'''
:param path:
:return:
'''
data = pd.read_csv(path)
return data
def get_headers(dataset):
'''
dataset headers
:param dataset:
:return:
'''
return dataset.columns.values
def add_headers(dataset, headers):
'''
Add the headers to the dataset
:param dataset:
:param headers:
:return:
'''
dataset.columns = headers
return dataset
def data_file_to_csv():
'''
:return:
'''
#Headers
"headers = ['HeaderName1', 'HeaderName2']"
#Load the dataset into Pandas data frame
dataset = read_data(INPUT_PATH)
#Add the headers to the loaded dataset
dataset = add_headers(dataset, headers)
#Save the loaded dataset into csv format
dataset.to_csv(OUTPUT_PATH, index=False)
print("File update completed!")
def split_dataset(dataset, train_percentage, feature_headers, target_header):
"""
Split the dataset with train_percentage
:param dataset:
:param train_percentage:
:param feature_headers:
:param target_header:
:return:
"""
#Split dataset into train and test dataset
train_x, test_x, train_y, test_y = train_test_split(dataset[feature_headers], dataset[target_header],
train_size=train_percentage)
return train_x, test_x, train_y, test_y
def handle_missing_values(dataset, missing_values_header, missing_label):
"""
Filter missing values from the dataset
:param dataset:
:param missing_values_header:
:param missing_label:
:return:
"""
return dataset[dataset[missing_values_header] != missing_label]
def random_forest_classifier(features, target):
"""
To train the random forest classifier with features and target data
:param features:
:param target:
:return:
"""
clf = RandomForestClassifier()
clf.fit(features, target)
return clf
def dataset_statistics(dataset):
"""
Basic statistics of the dataset
:param dataset:
:return:
"""
print(dataset.describe())
def main():
"""
Main Function
:return:
"""
#Load initial dataset
data_file_to_csv()
#Load the csv file into pandas dataframe
dataset = pd.read_csv(OUTPUT_PATH)
#Get basic statistics of the loaded dataset
dataset_statistics(dataset)
dataset = dataset.fillna(method='ffill')
print(np.all(np.isfinite(dataset)))
#Filter missing values
##dataset = handle_missing_values(dataset, headers[11], '?')
train_x, test_x, train_y, test_y = split_dataset(dataset, 0.4, headers[1:], headers[0])
print(headers[1:], headers[0])
#Train and Test dataset size details
print("Train_x Shape:: ", train_x.shape)
print("Train_y Shape :: ", train_y.shape)
print("Test_x Shape :: ", test_x.shape)
print("Test_y Shape :: ", test_y.shape)
# Create random forest classifier instance
trained_model = random_forest_classifier(train_x, train_y)
print("Trained model :: ", trained_model)
predictions = trained_model.predict(test_x)
for i in range(0, 500):
print("Actual outcome :: {} and Predicted outcome :: {}".format(list(test_y)[i], predictions[i]))
print("Train Accuracy :: ", accuracy_score(train_y, trained_model.predict(train_x)))
print("Test Accuracy :: ", accuracy_score(test_y, predictions))
print(" Confusion matrix ", confusion_matrix(test_y, predictions))
false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('ROC Curve')
plt.plot(false_positive_rate, true_positive_rate, 'b',
label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1], 'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
if __name__ == "__main__":
main()
认为你们可以帮助我吗?
提前致谢
以下是结果的完整错误和数据的快速描述:
UserID Systime EventTime ActivityID Pointer_count \
count 361834.000000 3.618380e+05 3.618380e+05 3.618380e+05 3.618380e+05
mean 163572.310084 3.869149e+17 2.290281e+07 1.638691e+14 1.912458e+09
std 27420.804226 2.327400e+20 4.654771e+09 2.745967e+13 5.768818e+11
min 100669.000000 1.519850e+05 3.855300e+04 4.529300e+04 1.000000e+00
25% 151985.000000 1.400000e+12 1.193756e+06 1.520000e+14 1.000000e+00
50% 171538.000000 1.400000e+12 2.643164e+06 1.720000e+14 1.000000e+00
75% 180679.000000 1.400000e+12 5.610611e+06 1.810000e+14 1.000000e+00
max 186676.000000 1.400000e+23 1.400000e+12 1.870000e+14 1.870000e+14
PointID ActionID X Y \
count 361838.000000 361838.000000 361838.000000 361838.000000
mean 0.129821 1.810238 566.241327 694.303258
std 0.342847 0.756644 256.634365 453.203547
min 0.000000 0.000000 0.000000 -71.000000
25% 0.000000 2.000000 360.558122 261.000000
50% 0.000000 2.000000 579.000000 617.102750
75% 0.000000 2.000000 764.032800 1107.000000
max 3.000000 6.000000 1062.822500 1840.000000
Pressure Contact_size Phone_orientation
count 361838.000000 361838.000000 3.618380e+05
mean 1.007636 0.024858 2.384341e-07
std 2.817316 0.004948 7.215543e-05
min 1.000000 0.011765 0.000000e+00
25% 1.000000 0.021569 0.000000e+00
50% 1.000000 0.023529 0.000000e+00
75% 1.000000 0.027451 0.000000e+00
max 1357.000000 1.000000 2.549020e-02
True
C:\Users\DGunn\Anaconda3\envs\tensorflow35\lib\site-packages\sklearn\model_selection\_split.py:2010: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
FutureWarning)
['Systime', 'EventTime', 'ActivityID', 'Pointer_count', 'PointID', 'ActionID', 'X', 'Y', 'Pressure', 'Contact_size', 'Phone_orientation'] UserID
144735 217103
(361838,)
Train_x Shape:: (144735, 11)
Train_y Shape :: (144735,)
Test_x Shape :: (217103, 11)
Test_y Shape :: (217103,)
Trained model :: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
Train Accuracy :: 1.0
Test Accuracy :: 1.0
Traceback (most recent call last):
File "C:/Users/DGunn/PycharmProjects/activeCA/RandomForest/RandomF_fusedTouchDataClassification.py", line 199, in <module>
main()
Confusion matrix [[28959 0 0 0 0]
File "C:/Users/DGunn/PycharmProjects/activeCA/RandomForest/RandomF_fusedTouchDataClassification.py", line 183, in main fpr, tpr, thresholds = roc_curve(test_y, predictions)
[ 0 42594 0 0 0]
[ 0 0 54374 0 0]
[ 0 0 0 37516 0]
[ 0 0 0 0 53660]]
File "C:\Users\DGunn\Anaconda3\envs\tensorflow35\lib\site-packages\sklearn\metrics\ranking.py", line 510, in roc_curve
y_true, y_score, pos_label=pos_label, sample_weight=sample_weight)
File "C:\Users\DGunn\Anaconda3\envs\tensorflow35\lib\site-packages\sklearn\metrics\ranking.py", line 319, in _binary_clf_curve
raise ValueError("Data is not binary and pos_label is not specified")
ValueError: Data is not binary and pos_label is not specified