我想将 Lasso 与 sklearn 中的其他分类器进行比较。我有一个二元结果向量 y。我通常会计算一个向量 probas,其中包含每个输入点的表型为 1 的预测概率,然后根据这两个向量(y 和 probas)生成 ROC 曲线。但是如何计算 Lasso 分类器的概率呢?它没有 predict_proba 方法。
对于其他分类器,此代码有效:
import numpy as np
import pandas as pd
from sklearn import cross_validation
from sklearn import datasets
from sklearn import metrics
from sklearn.cross_validation import LeaveOneOut
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# Build a toy binary problem: the iris measurements padded with many
# random noise columns, keeping only the samples labelled 0 or 1.
iris = datasets.load_iris()
random_state = np.random.RandomState(0)  # fixed seed -> same noise each run
n_samples, n_features = iris.data.shape
noise = random_state.randn(n_samples, 200 * n_features)
X = np.hstack([iris.data, noise])
y = iris.target
keep = y != 2  # drop the third class so the outcome is binary
X, y = X[keep], y[keep]
# Each display name is declared next to its estimator so the two parallel
# lists below cannot drift out of order.
_named_classifiers = [
    (
        "Random Forests",
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    ),
    ("Logistic Regression", LogisticRegression()),
]
classifiers = [estimator for _, estimator in _named_classifiers]
classifierNames = [label for label, _ in _named_classifiers]
# Leave-one-out ROC comparison: each classifier is refit once per held-out
# sample, the held-out probability of phenotype 1 is collected, and one ROC
# curve per classifier is drawn on a shared figure.
# NOTE(review): `plt` is presumably matplotlib.pyplot; no import for it is
# visible in this file -- confirm it is in scope before running.

# Create the figure before drawing anything so the requested size is used.
plt.figure(num=1, figsize=(30, 40))

# The split definition depends only on the sample count, so build it once.
loo = LeaveOneOut(len(y))

for name, clf in zip(classifierNames, classifiers):
    print(clf)

    # probas[k] is the predicted probability of phenotype 1 for sample k,
    # obtained from a model trained on every other sample.
    probas = []
    for train, test in loo:
        fitted = clf.fit(X[train], y[train])
        # First (only) held-out row, second column of predict_proba.
        probas.append(fitted.predict_proba(X[test])[0][1])

    dfphenotypes = pd.DataFrame(y)
    dfpredicted = pd.DataFrame(probas)

    roc_auc = metrics.roc_auc_score(dfphenotypes, dfpredicted)
    fpr, tpr, thresholds = metrics.roc_curve(dfphenotypes, dfpredicted)

    # One dashed curve per classifier, labelled with its name and AUC.
    plt.plot(fpr, tpr, '--', label=name + ' (area = %0.3f)' % roc_auc)
    print("auc =", roc_auc)

# Shared decoration, drawn once after all curves are on the axes.
plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate or (1 - Specifity)')
plt.ylabel('True Positive Rate or (Sensitivity)')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")