我尝试使用以下代码绘制火车和测试学习学习曲线:
import numpy as np
from sklearn import cross_validation
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.linear_model as lm
import pandas as pd
from sklearn.learning_curve import learning_curve
def main():
print("loading data..")
train_data = list(np.array(pd.read_table('train.tsv'))[:, 2])
test_data = list(np.array(pd.read_table('test.tsv'))[:, 2])
tr = np.array(pd.read_table('train.tsv'))
tfv = TfidfVectorizer(min_df=3, max_features=None,
strip_accents='unicode',analyzer='word',
token_pattern=r'\w{1,}',ngram_rang(1,2),
use_idf=1, smooth_idf=1, sublinear_tf=1)
rd = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001,
C=1, fit_intercept=True, intercept_scaling=1.0,
class_weight=None, random_state=None)
y=tr[:,-1].astype(int)
X_all = train_data + test_data
len_train = len(train_data)
print("fitting pipeline")
tfv.fit(X_all)
print("transforming data")
X_all = tfv.transform(X_all)
X = X_all[:len_train]
X_test = X_all[len_train:]
print("20 Fold CV Score: " +
str(np.mean(cross_validation.cross_val_score(rd, X, y, cv=20,
scoring='roc_auc'))))
print("training on full data")
rd.fit(X,y)
pred = rd.predict_proba(X_test)[:, 1]
test_file = pd.read_csv('test.tsv', sep="\t", na_values=['?'],index_col=1)
pred_df = pd.DataFrame(pred, index=test_file.index, columns=['label'])
pred_df.to_csv('benchmark.csv')
print("submission file created..")
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
n_jobs=1, train_sizes=np.linspace(1,7000,10)):
"""
Generate a simple plot of the test and traning learning curve.
Parameters
----------
estimator : object type that implements the "fit" and "predict" methods
An object of that type which is cloned for each validation.
title : string
Title for the chart.
X : array-like, shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and
n_features is the number of features.
y : array-like, shape (n_samples) or (n_samples, n_features), optional
Target relative to X for classification or regression;
ylim : tuple, shape (ymin, ymax), optional
Defines minimum and maximum yvalues plotted.
cv : integer, cross-validation generator, optional
n_jobs : integer, optional
Number of jobs to run in parallel (default 1).
"""
plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim)
plt.xlabel("Training examples")
plt.ylabel("Score")
train_sizes, train_scores, test_scores = learning_curve(estimator, X, y,
cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.grid()
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-
validation score")
plt.legend(loc="best")
return plt
X,Y = y,pred
title = "Learning Curves (tf-idf)"
cv = cross_validation.ShuffleSplit(pred_df, n_iter=100, test_size=0.20,
random_state=0)
estimator = TfidfVectorizer()
plot_learning_curve(estimator, title, X, Y, ylim=(0.1, 1.01), cv=cv,
n_jobs=4)
title = "Learning Curves (lr)"
cv = cross_validation.ShuffleSplit(pred_df, n_iter=10, test_size=0.20,
random_state=0)
estimator = lm()
plot_learning_curve(estimator, title, X, Y, (0.1, 1.01), cv=cv, n_jobs=4)
plt.show()
if __name__ == "__main__":
main()
它出现以下错误:
Traceback (most recent call last):
File "<ipython-input-17-fe9e40bbce16>", line 1, in <module>
runfile('C:/Users/Maitri/Documents/Python Scripts/first.py', wdir='C:/Users/Maitri/Documents/Python Scripts')
File "C:\Users\Maitri\Anaconda\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 580, in runfile
execfile(filename, namespace)
File "C:/Users/Maitri/Documents/Python Scripts/first.py", line 136, in <module>
main()
File "C:/Users/Maitri/Documents/Python Scripts/first.py", line 122, in main
cv = cross_validation.ShuffleSplit(pred_df, n_iter=100, test_size=0.20, random_state=0)
File "C:\Users\Maitri\Anaconda\lib\site-packages\sklearn\cross_validation.py", line 771, in __init__
train_size)
File "C:\Users\Maitri\Anaconda\lib\site-packages\sklearn\cross_validation.py", line 922, in _validate_shuffle_split
n_test = ceil(test_size * n)
TypeError: a float is required
有没有更好的方法来绘制预测结果的图表?
答案 0 :(得分:0)
您收到的错误的回溯显示问题的根源。
File "C:/Users/Maitri/Documents/Python Scripts/first.py", line 122, in main cv = cross_validation.ShuffleSplit(pred_df, n_iter=100, test_size=0.20, random_state=0)
File "C:\Users\Maitri\Anaconda\lib\site-packages\sklearn\cross_validation.py", line 771, in init train_size)
File "C:\Users\Maitri\Anaconda\lib\site-packages\sklearn\cross_validation.py", line 922, in _validate_shuffle_split n_test = ceil(test_size * n)
TypeError: a float is required
sklearn
包中引发了错误,但最终源自脚本中的line 122
。在这里,您传递包含数据的Pandas DataFrame。给出TypeError: a float is required
的错误表示ShuffleSplit
期待float
类型。
第一步是检查您是否正确加载输入文件,如下所示:
test_file = pd.read_csv('test.tsv', sep="\t", na_values=['?'],index_col=1)
pred_df = pd.DataFrame(pred, index=test_file.index, columns=['label'])
如果数据加载正常,但类型不是浮动,您可以使用或astype
numpy或as_dtype
将数据转换为正确的类型对于熊猫。 e.g。
pred_df = pd.DataFrame(pred, index=test_file.index, columns=['label']).as_dtype(np.float)
# ...or...
pred_df = pd.DataFrame(pred.astype(np.float), index=test_file.index, columns=['label'])
答案 1 :(得分:0)
ShuffleSplit的第一个参数是样本数n,您可以从文档中看到:http://scikit-learn.org/dev/modules/generated/sklearn.cross_validation.ShuffleSplit.html#sklearn.cross_validation.ShuffleSplit 你传递了一个数据帧。