Python KeyError:使用D2VTransformer

时间:2019-11-27 10:01:20

标签: python dataframe pipeline gensim doc2vec

在python中运行此代码时出现KeyError:0:

full_pipeline.fit(X_train, y_train)

这是完整的代码:

from gensim.sklearn_api import D2VTransformer
from sklearn.pipeline import FeatureUnion, Pipeline 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

name_pipeline = Pipeline( steps = [ 
                              ( 'feature_selector', FeatureSelector(['name']) ),
                              ( 'feature_transformer', D2VTransformer() ) ] )

description_pipeline = Pipeline( steps = [ 
                              ( 'feature_selector', FeatureSelector(['description']) ),
                              ( 'feature_transformer', D2VTransformer() ) ] )

X_pipeline = FeatureUnion( transformer_list = [ 
                                                  ( 'name_pipeline', name_pipeline ), 
                                                  ( 'description_pipeline', description_pipeline ) ] )

#Split up the train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

clf = LogisticRegression(random_state=0, class_weight='balanced', solver='lbfgs', max_iter=1000, multi_class='multinomial')

full_pipeline = Pipeline( steps = 
                         [ ( 'pipeline', X_pipeline),
                          ( 'model', clf ) ] )

full_pipeline.fit(X_train, y_train)

这是我得到的错误:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2896             try:
-> 2897                 return self._engine.get_loc(key)
   2898             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 0

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
19 frames
<ipython-input-14-0ddbaedffb67> in <module>()
     25                           ( 'model', clf ) ] )
     26 
---> 27 full_pipeline.fit(X_train, y_train)

/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    350             This estimator
    351         """
--> 352         Xt, fit_params = self._fit(X, y, **fit_params)
    353         with _print_elapsed_time('Pipeline',
    354                                  self._log_message(len(self.steps) - 1)):

/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    315                 message_clsname='Pipeline',
    316                 message=self._log_message(step_idx),
--> 317                 **fit_params_steps[name])
    318             # Replace the transformer of the step with the fitted
    319             # transformer. This is necessary when loading the transformer

/usr/local/lib/python3.6/dist-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    353 
    354     def __call__(self, *args, **kwargs):
--> 355         return self.func(*args, **kwargs)
    356 
    357     def call_and_shelve(self, *args, **kwargs):

/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    714     with _print_elapsed_time(message_clsname, message):
    715         if hasattr(transformer, 'fit_transform'):
--> 716             res = transformer.fit_transform(X, y, **fit_params)
    717         else:
    718             res = transformer.fit(X, y, **fit_params).transform(X)

/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    910             sum of n_components (output dimension) over transformers.
    911         """
--> 912         results = self._parallel_func(X, y, fit_params, _fit_transform_one)
    913         if not results:
    914             # All transformers are None

/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in _parallel_func(self, X, y, fit_params, func)
    940             message=self._log_message(name, idx, len(transformers)),
    941             **fit_params) for idx, (name, transformer,
--> 942                                     weight) in enumerate(transformers, 1))
    943 
    944     def transform(self, X):

/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in __call__(self, iterable)
   1001             # remaining jobs.
   1002             self._iterating = False
-> 1003             if self.dispatch_one_batch(iterator):
   1004                 self._iterating = self._original_iterator is not None
   1005 

/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    832                 return False
    833             else:
--> 834                 self._dispatch(tasks)
    835                 return True
    836 

/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in _dispatch(self, batch)
    751         with self._lock:
    752             job_idx = len(self._jobs)
--> 753             job = self._backend.apply_async(batch, callback=cb)
    754             # A job can complete so quickly than its callback is
    755             # called before we get here, causing self._jobs to

/usr/local/lib/python3.6/dist-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    199     def apply_async(self, func, callback=None):
    200         """Schedule a func to be run"""
--> 201         result = ImmediateResult(func)
    202         if callback:
    203             callback(result)

/usr/local/lib/python3.6/dist-packages/joblib/_parallel_backends.py in __init__(self, batch)
    580         # Don't delay the application, to avoid keeping the input
    581         # arguments in memory
--> 582         self.results = batch()
    583 
    584     def get(self):

/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in __call__(self)
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    255             return [func(*args, **kwargs)
--> 256                     for func, args, kwargs in self.items]
    257 
    258     def __len__(self):

/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in <listcomp>(.0)
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    255             return [func(*args, **kwargs)
--> 256                     for func, args, kwargs in self.items]
    257 
    258     def __len__(self):

/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    714     with _print_elapsed_time(message_clsname, message):
    715         if hasattr(transformer, 'fit_transform'):
--> 716             res = transformer.fit_transform(X, y, **fit_params)
    717         else:
    718             res = transformer.fit(X, y, **fit_params).transform(X)

/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    391                 return Xt
    392             if hasattr(last_step, 'fit_transform'):
--> 393                 return last_step.fit_transform(Xt, y, **fit_params)
    394             else:
    395                 return last_step.fit(Xt, y, **fit_params).transform(Xt)

/usr/local/lib/python3.6/dist-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
    554         else:
    555             # fit method of arity 2 (supervised transformation)
--> 556             return self.fit(X, y, **fit_params).transform(X)
    557 
    558 

/usr/local/lib/python3.6/dist-packages/gensim/sklearn_api/d2vmodel.py in fit(self, X, y)
    158 
    159         """
--> 160         if isinstance(X[0], doc2vec.TaggedDocument):
    161             d2v_sentences = X
    162         else:

/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in __getitem__(self, key)
   2993             if self.columns.nlevels > 1:
   2994                 return self._getitem_multilevel(key)
-> 2995             indexer = self.columns.get_loc(key)
   2996             if is_integer(indexer):
   2997                 indexer = [indexer]

/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2897                 return self._engine.get_loc(key)
   2898             except KeyError:
-> 2899                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2900         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2901         if indexer.ndim > 1 or indexer.size > 1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 0

有人知道为什么会这样吗?我认为这与D2VTransformer有关,因为当我运行下面的代码时,我遇到了相同的错误:

model = D2VTransformer(min_count=1, size=5)
docvecs = model.fit_transform(X_train) 

但是,当尝试从数据框中仅选择一列时:

docvecs = model.fit_transform(X_train['name']) 

它不会引发错误,这就是为什么当我创建管道时,我只使用了一列,但仍然收到错误。

X_train的外观。

name    description
9107    way great entrepreneur push limit help succeed  way great entrepreneur push limit
7706    dit het team week week  dit het team week week
3995    decorate home jewel tone    feel bold colour choice inspire fill home abun...
5220    attic meat district attic meat district
3412    tee apparel choose design item clothe accessory piece inde...
... ... ...
3830    marque web designer mode    marque web designer
3261    design holiday rest bite try lear magazine dai...   design holiday rest bite try lear
2415    hallucinatory house father spirit   music room hold tower season rug produce early...
7223    jacket rise jacket rise
4697    cupcake bake explorer   love love chocolate cupcake top kind easy foll...

有关X_train的更多详细信息:

X_train.shape
(7159, 2)

X_train.dtypes
name           object
description    object
dtype: object

1 个答案:

答案 0 :(得分:0)

似乎find中最近有bugfix(2019年10月,尚未在任何正式版本中)使gensim更加容忍某些熊猫系列作为数据源,用于解决与您遇到的异常相同的异常。

更改的代码行与扩展错误堆栈中显示的代码行完全相同-D2VTransformer的第160行,测试d2vmodel.py

我建议您获取X[0]最新版本的原始资源以供本地使用(而不是从d2vmodel.py导入),并检查是否可以解决您的问题。参见:

https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/sklearn_api/d2vmodel.py