我有两个带有相同标签列的数据框:df_Lib(5 行 × 97 列)和 df_Entre(5 行 × 97 列)。df_Lib 的目标变量名为 Liberal,而 df_Entre 的目标变量名为 Entrepreneurship。我使用随机森林分类器来预测给定数据属于 Liberal 还是 Entrepreneurship,然后绘制 ROC 曲线并计算 AUC。数据已经过清洗和预处理。到目前为止,我编写了以下代码:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Columns common to both datasets: the subreddit label plus the selected
# feature columns. Defined once so both frames are guaranteed to match.
SELECTED_COLUMNS = [
    'subreddit', 'i', 'you', 'negate', 'Sixltr', 'affect',
    'time', 'we', 'focuspresent', 'tentat',
]

df_Lib = pd.read_csv('Data_A.csv')[SELECTED_COLUMNS]
df_Entre = pd.read_csv('Data_B.csv')[SELECTED_COLUMNS]
df_Entre.head()  # preview only; the result is not used

# Stack the two datasets row-wise. ignore_index=True gives every row a unique
# index label, so the column-wise concat below lines rows up one-to-one
# instead of relying on duplicated labels from the two source files.
df = pd.concat([df_Lib, df_Entre], ignore_index=True)

# One-hot encode the subreddit label: one indicator column per distinct
# subreddit value. dtype=int keeps the indicators as 0/1 integers regardless
# of the pandas version's default dtype.
Reddit = pd.get_dummies(df['subreddit'], drop_first=False, dtype=int)
df2 = pd.concat([df, Reddit], axis=1)
df2.head()  # preview only; the result is not used

# Target: 1 where the row's subreddit is 'Entrepreneur', 0 otherwise.
y = df2['Entrepreneur']

# Features: drop the raw label AND every indicator column derived from it.
# Dropping only 'Entrepreneur' would leave the other subreddit indicator in X,
# which is the exact complement of y and lets the model read the answer
# directly (target leakage, producing a meaningless perfect ROC curve).
X = df2.drop(['subreddit', *Reddit.columns], axis=1)
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation.
# stratify=y keeps both classes represented in each split (ROC AUC is
# undefined when the test labels contain a single class); random_state makes
# the split, and therefore the reported curve, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Learn the scaling statistics from the training split only ...
X_train = sc.fit_transform(X_train)
# ... and reuse them for the test split. Calling fit_transform here would
# refit the scaler on the test rows, scaling the two splits differently and
# leaking test-set statistics into the evaluation.
X_test = sc.transform(X_test)
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
# Build a 100-tree random forest and train it on the training split
# (fit returns the estimator itself, so the call can be chained).
clf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = clf.predict(X_test)

# Keep only column 1 of the class-probability matrix, i.e. the probability
# assigned to clf.classes_[1]; these are the scores used for the ROC curve.
lr_probs = clf.predict_proba(X_test)[:, 1]
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from matplotlib import pyplot
# "No skill" baseline: the same score (0) for every test row.
ns_probs = [0] * len(y_test)

# Area under the ROC curve for the baseline and for the random forest.
ns_auc, lr_auc = (
    roc_auc_score(y_test, scores) for scores in (ns_probs, lr_probs)
)

# ROC curve points (false/true positive rates); thresholds are discarded.
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

# Draw both curves on the current axes: baseline first, then the model.
for fpr, tpr, line_style in (
    (ns_fpr, ns_tpr, {'linestyle': '--', 'label': 'No Skill'}),
    (lr_fpr, lr_tpr, {'marker': '.', 'label': 'Random_Forest'}),
):
    pyplot.plot(fpr, tpr, **line_style)

# Label the axes, add the legend and display the figure.
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()