在sklearn中使用数据集

时间:2014-02-05 07:30:55

标签: python numpy scipy scikit-learn

我有一个数据集,格式为.csv

id,interaction_flag,x_coordinate,y_coordinate,z_coordinate,hydrophobicity_kd,hydrophobicity_ww,hydrophobicity_hh,surface_tension,charge_cooh,charge_nh3,charge_r,alpha_helix,beta_strand,turn,van_der_walls,mol_wt,solublity  
229810,1,-33.8675148907451,-110.273691995647,100.021824089754,0.129381338742408,0.129381338742408,0.129381338742408,57.9996957403639,2.20539553752535,9.55985801217038,4.47146044624688,1.08064908722114,1.20135902636915,0.611653144016251,145.232251521298,107.951643002026,21.5344036511141        
229811,1,-26.9070290467923,-117.172163712053,106.980243932766,0.922048681541592,0.922048681541592,0.922048681541592,58.5383367139972,2.03983772819472,9.23210953346856,1.58401622717997,0.84178498985806,1.0387626774848,0.921703853955354,124.73630831643,84.1570182555755,10.7648600405665

我正在尝试按照以下链接中的示例,为这份数据绘制受试者工作特征(ROC)曲线:http://scikit-learn.org/0.11/auto_examples/plot_roc.html

我的目标列是interaction_flag,test则是interaction_flag之后的所有列。但是,我的程序一直在运行,始终无法结束。

当我运行该链接中给出的测试示例时,它一瞬间就运行完了。

有人能告诉我哪里做错了吗?或者我是否需要像iris数据集那样加载我的数据?

我的代码:

import numpy as np
import pylab as pl
from sklearn import svm, datasets
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc

training = 'dataset/training_5000_col.csv'
test = 'dataset/test_5000_col.csv'

random_state = np.random.RandomState(0)

# Import some data to play with
#iris = datasets.load_iris()
#X = iris.data
#y = iris.target


def _load_rows(path):
    """Parse a comma-separated file into feature rows and integer labels.

    Every non-blank line is split on ','. The label is read from column
    index 2 and the features from column indices 5 through 14.

    Returns a ``(features, labels)`` pair of plain Python lists, one entry
    per parsed line.

    NOTE(review): with the header shown in the question
    (id, interaction_flag, x_coordinate, ...) index 2 would be x_coordinate,
    not interaction_flag, and a header row would make int()/float() raise.
    The indices are kept as written because the layout of the actual
    training/test files is not visible here -- confirm against those files.
    """
    features = []
    labels = []
    # The with-block closes the file even when a conversion raises.
    with open(path) as handle:
        for line in handle:
            if not line.strip():
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            z = line.rstrip().split(',')
            labels.append(int(z[2]))
            features.append([float(z[a]) for a in range(5, 15)])
    return features, labels


# Every line of each file must contribute one sample, so the parsing lives
# entirely inside the per-line loop of _load_rows.
X, y = _load_rows(training)
X_train = np.array(X)
y_train = np.array(y)

X1, y1 = _load_rows(test)
X_test = np.array(X1)
y_test = np.array(y1)

# Run classifier
# A linear-kernel SVC with probability estimates enabled is fitted on the
# training set and then asked for class probabilities on the test set.
# NOTE(review): the question reports that this step never finishes; SVC
# training time may depend heavily on feature scaling -- worth checking.
classifier = svm.SVC(kernel='linear', probability=True)
probas_ = classifier.fit(X_train, y_train).predict_proba(X_test)

# Compute ROC curve and area under the curve
# Column 1 of probas_ is used as the score passed to roc_curve.
fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
# %-formatted single-argument print() calls parse on both Python 2 and
# Python 3 and emit the same text as the former print statements.
print("y_test :  %s" % (y_test,))
print("fpr :  %s" % (fpr,))
print("tpr :  %s" % (tpr,))
roc_auc = auc(fpr, tpr)
print("Area under the ROC curve : %f" % roc_auc)

# Plot ROC curve
# Start from a cleared figure, draw the computed curve, then add the
# reference diagonal from (0, 0) to (1, 1).
roc_label = 'ROC curve (area = %0.2f)' % roc_auc
axis_range = [0.0, 1.0]

pl.clf()
pl.plot(fpr, tpr, label=roc_label)
pl.plot([0, 1], [0, 1], 'k--')

# Both axes share the same limits.
pl.xlim(axis_range)
pl.ylim(axis_range)

pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic example')
pl.legend(loc="lower right")
pl.show()

我的.csv文件位于:http://pastebin.com/iet5xQW2 。我该如何用这个.csv文件绘制ROC曲线?

1 个答案:

答案 0 :(得分:2)

您需要有两个不同的标签(类别)才能绘制ROC曲线。在我向数据中添加了一些标签为0的样本之后,以下示例可以正常运行。我使用pandas来读取数据,其余部分与sklearn示例相同。

此外,您需要将数据集拆分为训练和测试集,以在测试集上绘制ROC曲线。

import pandas as pd
import numpy as np
from scipy import interp
import pylab as pl

from sklearn import svm
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import StratifiedKFold




def data(filename):
    """Load a comma-separated file and split it into features and labels.

    The file is read with pandas and converted to a NumPy array. Column
    index 1 is returned as the labels and every column from index 2 onward
    as the features; column index 0 is not returned.

    Parameters
    ----------
    filename : path of the CSV file to read.

    Returns
    -------
    (features, labels) : the 2-D feature array and the 1-D label array.
    """
    # read_csv is the comma-separated equivalent of read_table(sep=',').
    # The error_bad_lines / warn_bad_lines flags were removed in pandas 2.0;
    # they were only restating the defaults, so malformed rows still raise.
    frame = pd.read_csv(filename, low_memory=False)

    values = np.asarray(frame)

    features = values[:, 2:]
    labels = values[:, 1]
    # Print the distinct label values as a quick sanity check of the input.
    # A single parenthesised argument prints the same on Python 2 and 3.
    print(np.unique(labels))

    return features, labels




# Load the feature matrix and label vector from the CSV file.
filename = '../data/sodata.csv'
X, y = data(filename)

###############################################################################
# Classification and ROC analysis

# Stratified 6-fold cross-validation; the same linear SVC (with probability
# estimates enabled) is refitted on each training split.
cv = StratifiedKFold(y, n_folds=6)
classifier = svm.SVC(kernel='linear', probability=True, random_state=0)

# Per-fold curves are interpolated onto this shared FPR grid and summed.
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = 0.0
all_tpr = []

for i, (train, test) in enumerate(cv):
    classifier.fit(X[train], y[train])
    probas_ = classifier.predict_proba(X[test])

    # ROC curve and its area for this fold, scored on column 1 of probas_.
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    roc_auc = auc(fpr, tpr)

    # Accumulate the interpolated curve, pinning its first point to 0.
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0

    fold_label = 'ROC fold %d (area = %0.2f)' % (i, roc_auc)
    pl.plot(fpr, tpr, lw=1, label=fold_label)

# Grey dashed diagonal, labelled 'Luck'.
pl.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

# Turn the accumulated sum into a mean over the folds and pin its last
# point to 1 before computing the area.
mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
mean_label = 'Mean ROC (area = %0.2f)' % mean_auc
pl.plot(mean_fpr, mean_tpr, 'k--', label=mean_label, lw=2)

# Same limits on both axes, slightly wider than [0, 1].
axis_limits = [-0.05, 1.05]
pl.xlim(axis_limits)
pl.ylim(axis_limits)
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic example')
pl.legend(loc="lower right")
pl.show()