使用FeatureUnion拟合管道时的IndexError

时间:2017-06-22 10:26:22

标签: python python-3.x scikit-learn pipeline

我一直在

IndexError: only integers, slices (), ellipsis ( ... ), numpy.newaxis ( {无{1}}

尝试将我的数据框适合以下管道。 Train和Test是具有相同列的两个数据帧。有不同的列,但我只想通过ItemSelector关注其中的三个。

) and integer or boolean arrays are valid indices

完整错误:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import preprocessing
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import  OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

    class ItemSelector(BaseEstimator, TransformerMixin):

        def __init__(self, column):
            self.column = column

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            return X[self.column]


    def predictCases(train, test):
        target_names = sorted(list(set(train['TARGET'].values)))
        y_train  = np.array([target_names.index(x) for x in train['TARGET'].values])
        y_test   = np.array([target_names.index(x) for x in test['TARGET'].values])

        # train and predict
        classifier = Pipeline([
                    ('union', FeatureUnion([

                            ('text', Pipeline([
                                ('selector', ItemSelector(column='TEXT')),
                                ('tfidf_vec', TfidfVectorizer())
                            ])),

                            ('feature1', Pipeline([
                                ('selector', ItemSelector(column='CATEG_FEAT1')),
                                ('lbe', LabelEncoder())
                            ])),

                            ('feature2', Pipeline([
                                ('selector', ItemSelector(column='CATEG_FEAT2')),
                                ('lbe', LabelEncoder())
                            ]))
                    ])),
                    ('clf', OneVsRestClassifier(LinearSVC()))])
        classifier.fit(train.values, y_train)
        predicted = classifier.predict(test.values)
        return(metrics.precision_recall_fscore_support(y_test, predicted))

编辑:

如果我使用train而不是train.values,我会收到以下错误:

IndexError                                Traceback (most recent call last)
<ipython-input-19-95d9d0c337f4> in <module>()
----> 1 tt = predictCases(train_resampled, validate)

<ipython-input-17-efc951f4192e> in predictCases(train, test)
     24                 ])),
     25                 ('clf', OneVsRestClassifier(LinearSVC()))])
---> 26     classifier.fit(train.values, y_train)
     27     predicted = classifier.predict(test.values)
     28     return(metrics.precision_recall_fscore_support(y_test, predicted))

C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
    266             This estimator
    267         """
--> 268         Xt, fit_params = self._fit(X, y, **fit_params)
    269         if self._final_estimator is not None:
    270             self._final_estimator.fit(Xt, y, **fit_params)

C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
    232                 pass
    233             elif hasattr(transform, "fit_transform"):
--> 234                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
    235             else:
    236                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \

C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
    732             delayed(_fit_transform_one)(trans, name, weight, X, y,
    733                                         **fit_params)
--> 734             for name, trans, weight in self._iter())
    735 
    736         if not result:

C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
    756             # was dispatched. In particular this covers the edge
    757             # case of Parallel used with an exhausted iterator.
--> 758             while self.dispatch_one_batch(iterator):
    759                 self._iterating = True
    760             else:

C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
    606                 return False
    607             else:
--> 608                 self._dispatch(tasks)
    609                 return True
    610 

C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
    569         dispatch_timestamp = time.time()
    570         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571         job = self._backend.apply_async(batch, callback=cb)
    572         self._jobs.append(job)
    573 

C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
    107     def apply_async(self, func, callback=None):
    108         """Schedule a func to be run"""
--> 109         result = ImmediateResult(func)
    110         if callback:
    111             callback(result)

    C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
        324         # Don't delay the application, to avoid keeping the input
        325         # arguments in memory
    --> 326         self.results = batch()
        327 
        328     def get(self):

    C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
        129 
        130     def __call__(self):
    --> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        132 
        133     def __len__(self):

    C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
        129 
        130     def __call__(self):
    --> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        132 
        133     def __len__(self):

    C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params)
        575                        **fit_params):
        576     if hasattr(transformer, 'fit_transform'):
    --> 577         res = transformer.fit_transform(X, y, **fit_params)
        578     else:
        579         res = transformer.fit(X, y, **fit_params).transform(X)

    C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
        299         """
        300         last_step = self._final_estimator
    --> 301         Xt, fit_params = self._fit(X, y, **fit_params)
        302         if hasattr(last_step, 'fit_transform'):
        303             return last_step.fit_transform(Xt, y, **fit_params)

    C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
        232                 pass
        233             elif hasattr(transform, "fit_transform"):
    --> 234                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
        235             else:
        236                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \

    C:\\Anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
        495         else:
        496             # fit method of arity 2 (supervised transformation)
    --> 497             return self.fit(X, y, **fit_params).transform(X)
        498 
        499 

    <ipython-input-2-fdc42fd9d831> in transform(self, X)
         10 
         11     def transform(self, X):
    ---> 12         return X[self.column]

    IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

1 个答案:

答案 0 :(得分:1)

您将test.values(即带有原始DataFrame值的numpy数组)传递给classifier.predict和classifier.fit,而您的变换器需要一个DataFrame对象。