在 Python 中运行下面这行代码时出现 KeyError: 0:
full_pipeline.fit(X_train, y_train)
这是完整的代码:
from gensim.sklearn_api import D2VTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
name_pipeline = Pipeline( steps = [
( 'feature_selector', FeatureSelector(['name']) ),
( 'feature_transformer', D2VTransformer() ) ] )
description_pipeline = Pipeline( steps = [
( 'feature_selector', FeatureSelector(['description']) ),
( 'feature_transformer', D2VTransformer() ) ] )
X_pipeline = FeatureUnion( transformer_list = [
( 'name_pipeline', name_pipeline ),
( 'description_pipeline', description_pipeline ) ] )
#Split up the train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
clf = LogisticRegression(random_state=0, class_weight='balanced', solver='lbfgs', max_iter=1000, multi_class='multinomial')
full_pipeline = Pipeline( steps =
[ ( 'pipeline', X_pipeline),
( 'model', clf ) ] )
full_pipeline.fit(X_train, y_train)
这是我得到的错误:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2896 try:
-> 2897 return self._engine.get_loc(key)
2898 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 0
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
19 frames
<ipython-input-14-0ddbaedffb67> in <module>()
25 ( 'model', clf ) ] )
26
---> 27 full_pipeline.fit(X_train, y_train)
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
350 This estimator
351 """
--> 352 Xt, fit_params = self._fit(X, y, **fit_params)
353 with _print_elapsed_time('Pipeline',
354 self._log_message(len(self.steps) - 1)):
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
315 message_clsname='Pipeline',
316 message=self._log_message(step_idx),
--> 317 **fit_params_steps[name])
318 # Replace the transformer of the step with the fitted
319 # transformer. This is necessary when loading the transformer
/usr/local/lib/python3.6/dist-packages/joblib/memory.py in __call__(self, *args, **kwargs)
353
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
356
357 def call_and_shelve(self, *args, **kwargs):
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
714 with _print_elapsed_time(message_clsname, message):
715 if hasattr(transformer, 'fit_transform'):
--> 716 res = transformer.fit_transform(X, y, **fit_params)
717 else:
718 res = transformer.fit(X, y, **fit_params).transform(X)
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
910 sum of n_components (output dimension) over transformers.
911 """
--> 912 results = self._parallel_func(X, y, fit_params, _fit_transform_one)
913 if not results:
914 # All transformers are None
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in _parallel_func(self, X, y, fit_params, func)
940 message=self._log_message(name, idx, len(transformers)),
941 **fit_params) for idx, (name, transformer,
--> 942 weight) in enumerate(transformers, 1))
943
944 def transform(self, X):
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in __call__(self, iterable)
1001 # remaining jobs.
1002 self._iterating = False
-> 1003 if self.dispatch_one_batch(iterator):
1004 self._iterating = self._original_iterator is not None
1005
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
832 return False
833 else:
--> 834 self._dispatch(tasks)
835 return True
836
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in _dispatch(self, batch)
751 with self._lock:
752 job_idx = len(self._jobs)
--> 753 job = self._backend.apply_async(batch, callback=cb)
754 # A job can complete so quickly than its callback is
755 # called before we get here, causing self._jobs to
/usr/local/lib/python3.6/dist-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
199 def apply_async(self, func, callback=None):
200 """Schedule a func to be run"""
--> 201 result = ImmediateResult(func)
202 if callback:
203 callback(result)
/usr/local/lib/python3.6/dist-packages/joblib/_parallel_backends.py in __init__(self, batch)
580 # Don't delay the application, to avoid keeping the input
581 # arguments in memory
--> 582 self.results = batch()
583
584 def get(self):
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
714 with _print_elapsed_time(message_clsname, message):
715 if hasattr(transformer, 'fit_transform'):
--> 716 res = transformer.fit_transform(X, y, **fit_params)
717 else:
718 res = transformer.fit(X, y, **fit_params).transform(X)
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
391 return Xt
392 if hasattr(last_step, 'fit_transform'):
--> 393 return last_step.fit_transform(Xt, y, **fit_params)
394 else:
395 return last_step.fit(Xt, y, **fit_params).transform(Xt)
/usr/local/lib/python3.6/dist-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
554 else:
555 # fit method of arity 2 (supervised transformation)
--> 556 return self.fit(X, y, **fit_params).transform(X)
557
558
/usr/local/lib/python3.6/dist-packages/gensim/sklearn_api/d2vmodel.py in fit(self, X, y)
158
159 """
--> 160 if isinstance(X[0], doc2vec.TaggedDocument):
161 d2v_sentences = X
162 else:
/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in __getitem__(self, key)
2993 if self.columns.nlevels > 1:
2994 return self._getitem_multilevel(key)
-> 2995 indexer = self.columns.get_loc(key)
2996 if is_integer(indexer):
2997 indexer = [indexer]
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2897 return self._engine.get_loc(key)
2898 except KeyError:
-> 2899 return self._engine.get_loc(self._maybe_cast_indexer(key))
2900 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2901 if indexer.ndim > 1 or indexer.size > 1:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 0
有人知道为什么会这样吗?我认为这与D2VTransformer有关,因为当我运行下面的代码时,我遇到了相同的错误:
model = D2VTransformer(min_count=1, size=5)
docvecs = model.fit_transform(X_train)
但是,当尝试从数据框中仅选择一列时:
docvecs = model.fit_transform(X_train['name'])
它不会引发错误。正因如此,我在创建管道时让每个子管道只选取一列,但仍然收到同样的错误。
X_train 的内容如下:
name description
9107 way great entrepreneur push limit help succeed way great entrepreneur push limit
7706 dit het team week week dit het team week week
3995 decorate home jewel tone feel bold colour choice inspire fill home abun...
5220 attic meat district attic meat district
3412 tee apparel choose design item clothe accessory piece inde...
... ... ...
3830 marque web designer mode marque web designer
3261 design holiday rest bite try lear magazine dai... design holiday rest bite try lear
2415 hallucinatory house father spirit music room hold tower season rug produce early...
7223 jacket rise jacket rise
4697 cupcake bake explorer love love chocolate cupcake top kind easy foll...
有关X_train的更多详细信息:
X_train.shape
(7159, 2)
X_train.dtypes
name object
description object
dtype: object
答案 0 :(得分:0)
似乎 gensim 中最近有一个 bug 及其修复(2019 年 10 月,尚未包含在任何正式版本中),使 D2VTransformer 更能容忍将某些 pandas Series 作为数据源,解决的正是与您遇到的相同的异常。
更改的代码行与您的完整错误堆栈中显示的代码行完全相同——d2vmodel.py 的第 160 行,即测试 X[0] 的那一行。
我建议您获取 d2vmodel.py 最新版本的原始源码以供本地使用(而不是从 gensim 导入),并检查是否可以解决您的问题。参见:
https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/sklearn_api/d2vmodel.py