python sklearn:IndexError: 'too many indices for array'(数组的索引过多)

时间:2017-08-18 06:56:31

标签: python pandas scikit-learn

我是机器学习的新手。我最近遇到了一个问题,已经在 Stack Overflow 上查找过相同主题的帖子,但仍然没能弄明白。有人可以帮忙看看吗?非常感谢!

#-*- coding:utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the training and test workbooks; index_col=0 turns the first sheet
# column (the "id" column in the sample table shown below) into the index.
data_train = pd.read_excel('py_train.xlsx',index_col=0)
# test_data is loaded here but is not used anywhere in the code shown.
test_data = pd.read_excel('py_test.xlsx',index_col=0)


from sklearn import preprocessing

# Features: every column after the first remaining one.
# NOTE(review): as_matrix() is deprecated in newer pandas releases in favour
# of .values / .to_numpy() — confirm against the installed version.
x = data_train.iloc[:,1:].as_matrix()
# Target: the first remaining column (presumably "rep" — confirm against the
# sheet). The slice 0:1 keeps the column axis, so y is 2-D with shape
# (n_samples, 1) instead of a flat (n_samples,) vector; iloc[:,0] would give
# the 1-D form.
y = data_train.iloc[:,0:1].as_matrix()

# sx holds the output of preprocessing.scale(x); later code still passes the
# unscaled x around.
sx = preprocessing.scale(x)

from sklearn import linear_model
clf = linear_model.LogisticRegression()
# Fit on the scaled features and the column-shaped target.
clf.fit(sx,y)

# Bare expression: presumably echoes the fitted estimator in a notebook cell.
clf

上面的代码运行正常,数据也已全部清洗完毕。我用来拟合的数据如下:

id  rep a   b   c   d
1   0   1   2   3   4
2   0   2   3   4   5
3   0   3   4   5   6
4   1   4   5   6   7
5   1   5   6   7   8
6   1   6   7   8   9
7   1   7   8   9   10
8   1   8   9   10  11
9   1   9   10  11  12
10  1   10  11  12  13

但是,下面的代码却抛出了 IndexError。这是为什么?我该如何解决?

谢谢!

import numpy as np
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve


def plot_learning_curve(estimator, title, x, y, ylim=None, cv=None, n_jobs=1, 
                        train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    """Compute a learning curve for ``estimator`` and optionally plot it.

    ``estimator``, ``x``, ``y``, ``cv``, ``n_jobs``, ``train_sizes`` and
    ``verbose`` are forwarded unchanged to ``learning_curve``; this function
    does no validation or reshaping of ``x`` / ``y`` itself.

    Args:
        estimator: Model object handed to ``learning_curve``.
        title: Title text of the figure.
        x: Feature data handed to ``learning_curve``.
        y: Target data handed to ``learning_curve``.
        ylim: Optional sequence unpacked into ``plt.ylim(*ylim)``; ``None``
            leaves the y-axis limits untouched.
        cv: Cross-validation setting handed to ``learning_curve``.
        n_jobs: Parallelism setting handed to ``learning_curve``.
        train_sizes: Training-set sizes to evaluate. The default is 20 evenly
            spaced values from 0.05 to 1.0, created once when the function is
            defined; the name is rebound (not mutated) inside the function.
        verbose: Verbosity setting handed to ``learning_curve``.
        plot: When true, draw and show the figure; when false, only the
            summary values are computed.

    Returns:
        A ``(midpoint, diff)`` tuple computed from the last training size:
        with ``upper = train mean + train std`` and
        ``lower = test mean - test std``, ``midpoint`` is ``(upper + lower) / 2``
        and ``diff`` is ``upper - lower``.
    """

    # train_sizes is replaced by the sizes that learning_curve reports back.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)

    # Collapse axis 1 (presumably one column per CV fold — confirm) into a
    # mean and a standard deviation per training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)   # apply the caller-supplied y-axis limits
        plt.xlabel(u"train set size")
        plt.ylabel(u"score")
        # NOTE(review): invert_yaxis() is called again after plt.draw() below,
        # so the two calls presumably cancel out — confirm this is intended.
        plt.gca().invert_yaxis()
        plt.grid()    # show grid lines

        # Shaded bands of mean +/- one standard deviation for each curve.
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, 
                         alpha=0.1, color="b")       # generates a shaded region 
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, 
                         alpha=0.1, color="r")
        # Mean score lines: blue for the training set, red for cross-validation.
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"train set score")    
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"CV score")

        plt.legend(loc="best")

        plt.draw()
        plt.gca().invert_yaxis()
        plt.show()

    # Summary at the last training size: upper edge of the training band versus
    # lower edge of the cross-validation band.
    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff

# Passes the unscaled x and the (n_samples, 1) column-shaped y built earlier;
# this is the call that raises the IndexError shown in the traceback below.
plot_learning_curve(clf, u"learning_curve", x, y)

完整信息:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-18-0dc3d0934602> in <module>()
     42     return midpoint, diff
     43 
---> 44 plot_learning_curve(clf, u"learning_curve", x, y)

<ipython-input-18-0dc3d0934602> in plot_learning_curve(estimator, title, x, y, ylim, cv, n_jobs, train_sizes, verbose, plot)
      8 
      9     train_sizes, train_scores, test_scores = learning_curve(
---> 10         estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
     11 
     12     train_scores_mean = np.mean(train_scores, axis=1)

D:\Anaconda3\lib\site-packages\sklearn\learning_curve.py in learning_curve(estimator, X, y, train_sizes, cv, scoring, exploit_incremental_learning, n_jobs, pre_dispatch, verbose, error_score)
    138     X, y = indexable(X, y)
    139     # Make a list since we will be iterating multiple times over the folds
--> 140     cv = list(check_cv(cv, X, y, classifier=is_classifier(estimator)))
    141     scorer = check_scoring(estimator, scoring=scoring)
    142 

D:\Anaconda3\lib\site-packages\sklearn\cross_validation.py in check_cv(cv, X, y, classifier)
   1821         if classifier:
   1822             if type_of_target(y) in ['binary', 'multiclass']:
-> 1823                 cv = StratifiedKFold(y, cv)
   1824             else:
   1825                 cv = KFold(_num_samples(y), cv)

D:\Anaconda3\lib\site-packages\sklearn\cross_validation.py in __init__(self, y, n_folds, shuffle, random_state)
    567         for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)):
    568             for label, (_, test_split) in zip(unique_labels, per_label_splits):
--> 569                 label_test_folds = test_folds[y == label]
    570                 # the test split can be too big because we used
    571                 # KFold(max(c, self.n_folds), self.n_folds) instead of

IndexError: too many indices for array

1 个答案:

答案 0 :(得分:0)

逻辑回归可以接受这种输入,但交叉验证器似乎只接受一维数组形式的 y 值。而你传入的似乎是一个矩阵(二维数组)。

检查区别:

你传递的是:

df.iloc[:,0:1].as_matrix()
array([[0],
       [1],
       [2]], dtype=int64)

但使用下面的写法可能更好:
df.iloc[:,0].as_matrix()
array([0, 1, 2], dtype=int64)
你能试试吗?