scikit-learn FeatureUnion无法将文本特征和数值特征组合在一起

时间:2018-11-16 21:07:26

标签: python scikit-learn

我正在尝试将数据集中电影情节的文本列与每部电影的分级(MPAA分级:G、PG、PG-13、R;不是IMDb用户评分)的分类列结合起来。我正在使用sklearn的FeatureUnion对象,但是由于fit_transform方法被传入了过多的位置参数,我不断收到错误消息。这是我的代码:

# create training and testing sets
# Features are the 'Genre' and 'Plot' columns; the target is the 'Rated'
# column expanded into indicator columns by pd.get_dummies.
X_train, X_test, y_train, y_test = train_test_split(movie_ratings[['Genre', 'Plot']], pd.get_dummies(movie_ratings['Rated']), random_state=56)

''' create a processing pipeline and feature union '''
# create function transformers
# Each transformer selects a single column from the incoming frame;
# validate=False turns off FunctionTransformer's input validation.
get_genre_data = FunctionTransformer(lambda x: x['Genre'], validate=False)
get_plot_data = FunctionTransformer(lambda x: x['Plot'], validate=False)

# obtain the data
# NOTE(review): `genres` and `plots` are not referenced again in this snippet.
genres = get_genre_data.fit_transform(movie_ratings)
plots = get_plot_data.fit_transform(movie_ratings)

# # join the processing in a feature union
# Two sub-pipelines run on the same input and their outputs are concatenated.
join_data_formats = FeatureUnion(
    transformer_list = [
        ('genres', Pipeline([
            ('selector', get_genre_data),
            # NOTE(review): LabelEncoder is meant for target labels; its
            # fit_transform accepts only `y`. As the last step of this inner
            # Pipeline it is called as fit_transform(Xt, y, ...), which matches
            # the reported "takes 2 positional arguments but 3 were given".
            # The step is named 'one_hot_encoder' but LabelEncoder yields
            # integer codes -- a feature encoder (e.g. OneHotEncoder or
            # CountVectorizer on the genre strings) is presumably intended.
            ('one_hot_encoder', LabelEncoder())
        ])),
        ('plots', Pipeline([
            ('selector', get_plot_data),
            # NOTE(review): `nltk.tokenize` looks like the module, not a
            # callable; `tokenizer` expects a function such as
            # nltk.word_tokenize -- confirm.
            ('count_vectorizer', CountVectorizer(tokenizer=nltk.tokenize)),
            ('tfidf_transformer', TfidfTransformer())
        ]))
    ]
)

# # instantiate a nested pipeline
# The combined features feed an MLP classifier as the final estimator.
pipeline = Pipeline([
    ('feature_union', join_data_formats),
    ('neural_network', MLPClassifier(alpha=0.01, hidden_layer_sizes=(100,), early_stopping=False, verbose=True))
])

# # fit the pipeline to the training data
# This is the call that raises the TypeError shown in the traceback.
pipeline.fit(X_train, y_train)

...并且抛出的错误是:

     34 # # fit the pipeline to the training data
---> 35 pipeline.fit(X_train, y_train)

...

TypeError: fit_transform() takes 2 positional arguments but 3 were given

我哪里做错了?非常感谢您的帮助!

更新:这是完整的堆栈跟踪:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-171-f57d9b24a9c8> in <module>()
     28 # print(y_test.shape)
     29 
---> 30 pipeline.fit(X_train, y_train)
     31 y_pred = pipeline.predict(X_test)
     32 

~\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
    246             This estimator
    247         """
--> 248         Xt, fit_params = self._fit(X, y, **fit_params)
    249         if self._final_estimator is not None:
    250             self._final_estimator.fit(Xt, y, **fit_params)

~\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
    211                 Xt, fitted_transformer = fit_transform_one_cached(
    212                     cloned_transformer, None, Xt, y,
--> 213                     **fit_params_steps[name])
    214                 # Replace the transformer of the step with the fitted
    215                 # transformer. This is necessary when loading the transformer

~\Anaconda3\lib\site-packages\sklearn\externals\joblib\memory.py in __call__(self, *args, **kwargs)
    360 
    361     def __call__(self, *args, **kwargs):
--> 362         return self.func(*args, **kwargs)
    363 
    364     def call_and_shelve(self, *args, **kwargs):

~\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params)
    579                        **fit_params):
    580     if hasattr(transformer, 'fit_transform'):
--> 581         res = transformer.fit_transform(X, y, **fit_params)
    582     else:
    583         res = transformer.fit(X, y, **fit_params).transform(X)

~\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
    737             delayed(_fit_transform_one)(trans, weight, X, y,
    738                                         **fit_params)
--> 739             for name, trans, weight in self._iter())
    740 
    741         if not result:

~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
    777             # was dispatched. In particular this covers the edge
    778             # case of Parallel used with an exhausted iterator.
--> 779             while self.dispatch_one_batch(iterator):
    780                 self._iterating = True
    781             else:

~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
    623                 return False
    624             else:
--> 625                 self._dispatch(tasks)
    626                 return True
    627 

~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
    586         dispatch_timestamp = time.time()
    587         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588         job = self._backend.apply_async(batch, callback=cb)
    589         self._jobs.append(job)
    590 

~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
    109     def apply_async(self, func, callback=None):
    110         """Schedule a func to be run"""
--> 111         result = ImmediateResult(func)
    112         if callback:
    113             callback(result)

~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
    330         # Don't delay the application, to avoid keeping the input
    331         # arguments in memory
--> 332         self.results = batch()
    333 
    334     def get(self):

~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

~\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params)
    579                        **fit_params):
    580     if hasattr(transformer, 'fit_transform'):
--> 581         res = transformer.fit_transform(X, y, **fit_params)
    582     else:
    583         res = transformer.fit(X, y, **fit_params).transform(X)

~\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
    281         Xt, fit_params = self._fit(X, y, **fit_params)
    282         if hasattr(last_step, 'fit_transform'):
--> 283             return last_step.fit_transform(Xt, y, **fit_params)
    284         elif last_step is None:
    285             return Xt

TypeError: fit_transform() takes 2 positional arguments but 3 were given

0 个答案:

没有答案