使用matplotlib绘制图形

时间:2015-02-23 10:59:39

标签: python matplotlib scikit-learn

我尝试使用以下代码绘制火车和测试学习学习曲线:

import numpy as np
from sklearn import cross_validation
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.linear_model as lm
import pandas as pd
from sklearn.learning_curve import learning_curve

def main():
  print("loading data..")
  train_data = list(np.array(pd.read_table('train.tsv'))[:, 2])
  test_data = list(np.array(pd.read_table('test.tsv'))[:, 2])
  tr = np.array(pd.read_table('train.tsv'))
  tfv = TfidfVectorizer(min_df=3,  max_features=None,
   strip_accents='unicode',analyzer='word',
   token_pattern=r'\w{1,}',ngram_rang(1,2),
                      use_idf=1, smooth_idf=1, sublinear_tf=1)

  rd = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001,
                           C=1, fit_intercept=True, intercept_scaling=1.0,
                           class_weight=None, random_state=None)
  y=tr[:,-1].astype(int)
  X_all = train_data + test_data
  len_train = len(train_data)

  print("fitting pipeline")
  tfv.fit(X_all)
  print("transforming data")
  X_all = tfv.transform(X_all)

  X = X_all[:len_train]
  X_test = X_all[len_train:]

  print("20 Fold CV Score: " +                 
  str(np.mean(cross_validation.cross_val_score(rd, X, y, cv=20,
   scoring='roc_auc'))))

  print("training on full data")
  rd.fit(X,y)
  pred = rd.predict_proba(X_test)[:, 1]
  test_file = pd.read_csv('test.tsv', sep="\t", na_values=['?'],index_col=1)
  pred_df = pd.DataFrame(pred, index=test_file.index, columns=['label'])   
  pred_df.to_csv('benchmark.csv')
  print("submission file created..")

  def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,        
   n_jobs=1, train_sizes=np.linspace(1,7000,10)):
  """
   Generate a simple plot of the test and traning learning curve.

   Parameters
   ----------
   estimator : object type that implements the "fit" and "predict" methods
    An object of that type which is cloned for each validation.

   title : string
    Title for the chart.

   X : array-like, shape (n_samples, n_features)
    Training vector, where n_samples is the number of samples and
    n_features is the number of features.

   y : array-like, shape (n_samples) or (n_samples, n_features), optional
    Target relative to X for classification or regression;

   ylim : tuple, shape (ymin, ymax), optional
    Defines minimum and maximum yvalues plotted.

   cv : integer, cross-validation generator, optional

   n_jobs : integer, optional
    Number of jobs to run in parallel (default 1).
  """
    plt.figure()
    plt.title(title)
    if ylim is not None:
      plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y,      
      cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,     
     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,  
     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", 
      label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-
     validation score")

    plt.legend(loc="best")
    return plt

  X,Y = y,pred


  title = "Learning Curves (tf-idf)"

  cv = cross_validation.ShuffleSplit(pred_df, n_iter=100, test_size=0.20,      
   random_state=0)

  estimator = TfidfVectorizer()
  plot_learning_curve(estimator, title, X, Y, ylim=(0.1, 1.01), cv=cv, 
    n_jobs=4)

  title = "Learning Curves (lr)"

  cv = cross_validation.ShuffleSplit(pred_df, n_iter=10, test_size=0.20, 
    random_state=0)
  estimator = lm()
  plot_learning_curve(estimator, title, X, Y, (0.1, 1.01), cv=cv, n_jobs=4)

  plt.show()    

if __name__ == "__main__":
  main()

它出现以下错误:

Traceback (most recent call last):

  File "<ipython-input-17-fe9e40bbce16>", line 1, in <module>
    runfile('C:/Users/Maitri/Documents/Python Scripts/first.py', wdir='C:/Users/Maitri/Documents/Python Scripts')

  File "C:\Users\Maitri\Anaconda\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 580, in runfile
    execfile(filename, namespace)

  File "C:/Users/Maitri/Documents/Python Scripts/first.py", line 136, in <module>
    main()

  File "C:/Users/Maitri/Documents/Python Scripts/first.py", line 122, in main
    cv = cross_validation.ShuffleSplit(pred_df, n_iter=100, test_size=0.20, random_state=0)

  File "C:\Users\Maitri\Anaconda\lib\site-packages\sklearn\cross_validation.py", line 771, in __init__
    train_size)

  File "C:\Users\Maitri\Anaconda\lib\site-packages\sklearn\cross_validation.py", line 922, in _validate_shuffle_split
    n_test = ceil(test_size * n)

TypeError: a float is required

有没有更好的方法来绘制预测结果的图表?

2 个答案:

答案 0 :(得分:0)

您收到的错误的回溯显示问题的根源。

File "C:/Users/Maitri/Documents/Python Scripts/first.py", line 122, in main cv = cross_validation.ShuffleSplit(pred_df, n_iter=100, test_size=0.20, random_state=0)

File "C:\Users\Maitri\Anaconda\lib\site-packages\sklearn\cross_validation.py", line 771, in init train_size)

File "C:\Users\Maitri\Anaconda\lib\site-packages\sklearn\cross_validation.py", line 922, in _validate_shuffle_split n_test = ceil(test_size * n)

TypeError: a float is required

sklearn包中引发了错误,但最终源自脚本中的line 122。在这里,您传递包含数据的Pandas DataFrame。给出TypeError: a float is required的错误表示ShuffleSplit期待float类型。

第一步是检查您是否正确加载输入文件,如下所示:

test_file = pd.read_csv('test.tsv', sep="\t", na_values=['?'],index_col=1)
pred_df = pd.DataFrame(pred, index=test_file.index, columns=['label'])   

如果数据加载正常,但类型不是浮动,您可以使用或astype numpy或as_dtype将数据转换为正确的类型对于熊猫。 e.g。

pred_df = pd.DataFrame(pred, index=test_file.index, columns=['label']).as_dtype(np.float)

# ...or...

pred_df = pd.DataFrame(pred.astype(np.float), index=test_file.index, columns=['label'])

答案 1 :(得分:0)

ShuffleSplit的第一个参数是样本数n,您可以从文档中看到:http://scikit-learn.org/dev/modules/generated/sklearn.cross_validation.ShuffleSplit.html#sklearn.cross_validation.ShuffleSplit 你传递了一个数据帧。