我是机器学习的新鱼。我最近遇到了一个问题,并且已经在StackOverFlow中查找了相同的主题,但仍然无法弄明白。有人可以看看吗?非常感谢!
#-*- coding:utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data_train = pd.read_excel('py_train.xlsx',index_col=0)
test_data = pd.read_excel('py_test.xlsx',index_col=0)
from sklearn import preprocessing
x = data_train.iloc[:,1:].as_matrix()
y = data_train.iloc[:,0:1].as_matrix()
sx = preprocessing.scale(x)
from sklearn import linear_model
clf = linear_model.LogisticRegression()
clf.fit(sx,y)
clf
代码运行良好,数据全部清除。我适合数据,如:
id rep a b c d
1 0 1 2 3 4
2 0 2 3 4 5
3 0 3 4 5 6
4 1 4 5 6 7
5 1 5 6 7 8
6 1 6 7 8 9
7 1 7 8 9 10
8 1 8 9 10 11
9 1 9 10 11 12
10 1 10 11 12 13
,下面的代码显示了一个IndexError。为什么?我该如何解决?
谢谢!
import numpy as np
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve
def plot_learning_curve(estimator, title, x, y, ylim=None, cv=None, n_jobs=1,
train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
train_sizes, train_scores, test_scores = learning_curve(
estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
if plot:
plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim) #ylim=y's limit
plt.xlabel(u"train set size")
plt.ylabel(u"score")
plt.gca().invert_yaxis()
plt.grid() #网格
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
alpha=0.1, color="b") # generates a shaded region
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
alpha=0.1, color="r")
plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"train set score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"CV score")
plt.legend(loc="best")
plt.draw()
plt.gca().invert_yaxis()
plt.show()
midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
return midpoint, diff
plot_learning_curve(clf, u"learning_curve", x, y)
完整信息:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-18-0dc3d0934602> in <module>()
42 return midpoint, diff
43
---> 44 plot_learning_curve(clf, u"learning_curve", x, y)
<ipython-input-18-0dc3d0934602> in plot_learning_curve(estimator, title, x, y, ylim, cv, n_jobs, train_sizes, verbose, plot)
8
9 train_sizes, train_scores, test_scores = learning_curve(
---> 10 estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
11
12 train_scores_mean = np.mean(train_scores, axis=1)
D:\Anaconda3\lib\site-packages\sklearn\learning_curve.py in learning_curve(estimator, X, y, train_sizes, cv, scoring, exploit_incremental_learning, n_jobs, pre_dispatch, verbose, error_score)
138 X, y = indexable(X, y)
139 # Make a list since we will be iterating multiple times over the folds
--> 140 cv = list(check_cv(cv, X, y, classifier=is_classifier(estimator)))
141 scorer = check_scoring(estimator, scoring=scoring)
142
D:\Anaconda3\lib\site-packages\sklearn\cross_validation.py in check_cv(cv, X, y, classifier)
1821 if classifier:
1822 if type_of_target(y) in ['binary', 'multiclass']:
-> 1823 cv = StratifiedKFold(y, cv)
1824 else:
1825 cv = KFold(_num_samples(y), cv)
D:\Anaconda3\lib\site-packages\sklearn\cross_validation.py in __init__(self, y, n_folds, shuffle, random_state)
567 for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)):
568 for label, (_, test_split) in zip(unique_labels, per_label_splits):
--> 569 label_test_folds = test_folds[y == label]
570 # the test split can be too big because we used
571 # KFold(max(c, self.n_folds), self.n_folds) instead of
IndexError: too many indices for array
答案 0 :(得分:0)
逻辑回归接受,交叉验证器似乎只接受y值的数组。你似乎传递了一个矩阵
检查区别:
你传递的是:
df.iloc[:,0:1].as_matrix()
array([[0],
[1],
[2]], dtype=int64)
但使用
可能更好df.iloc[:,0].as_matrix()
array([0, 1, 2], dtype=int64)
你能试试吗?