Question

我有两个相同标签数据的数据框 df_Lib（5 行 × 97 列）和 df_Entre（5 行 × 97 列）。 df_Lib 具有名为 Liberal 的目标变量，而 df_Entre 具有 Entrepreneurship。我使用随机森林分类器来预测给定数据是在 Liberal 或 Entrepreneurship 之下，然后绘制 ROC 和 AUC 曲线。数据经过清洗和预处理。到目前为止编写了以下代码：

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    df_Lib=pd.read_csv('Data_A.csv')
    
    
    df_Lib=df_Lib[['subreddit','i', 'you','negate','Sixltr','affect', 'time','we','focuspresent','tentat']]      # selected features common to both datasets
    
    df_Entre=pd.read_csv('Data_B.csv')
    df_Entre.head()
    
    df_Entre=df_Entre[['subreddit','i', 'you','negate','Sixltr','affect', 'time','we','focuspresent','tentat']]  #selected features common to both datasets
    df_Entre.head()
    
    df=pd.concat([df_Lib,df_Entre])   # combine these two datasets
    
    Reddit=pd.get_dummies(df['subreddit'],drop_first=False)  # since target variable is Subreddit (categorical variable), i created dummy values.
    
    df2=pd.concat([df,Reddit],axis=1)     # The Reddit shows o for Entrepreneur and 1 for liberals
    df2.head()
        
    X=df2.drop(['subreddit','Entrepreneur'],axis=1)
    y=df2['Entrepreneur']
    
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    
    # # Standard scale improves the accuracy
    from sklearn.preprocessing import StandardScaler
    sc= StandardScaler()
    X_train=sc.fit_transform(X_train)
    X_test=sc.fit_transform(X_test)
    
    
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.datasets import make_classification
    
    clf=RandomForestClassifier(n_estimators=100)
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)
    
    lr_probs = clf.predict_proba(X_test)
    lr_probs = lr_probs[:, 1]
    
    
    from sklearn.metrics import roc_auc_score
    from sklearn.metrics import roc_curve
    from matplotlib import pyplot
    
    ns_probs = [0 for _ in range(len(y_test))]
    
    
    ns_auc = roc_auc_score(y_test, ns_probs)
    lr_auc = roc_auc_score(y_test, lr_probs)
    
    
    # calculate roc curves
    ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
    lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
    # plot the roc curve for the model
    pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
    pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Random_Forest')
    # axis labels
    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    # show the legend
    pyplot.legend()
    # show the plot
    pyplot.show()

这给了我奇怪的曲线请忽略图形标签。它的错字错误。我不知道我在分类器或 ROC 中哪里出错了。请帮忙。提前致谢

随机森林分类器和 ROC 和 AUC 曲线误差

0 个答案: