如何在sklearn中通过管道(Pipeline)对输出列应用OneHotEncoder?

时间:2020-11-10 03:11:15

标签: python pandas numpy dataframe scikit-learn

我正在使用逻辑回归将图像分为3类,以后类别可能还会更多。我的标签列是train_data["label"],它由类别名称的字符串组成。我想要做的是将整个数据帧传递到一个包含column_transformer的Pipeline中,该column_transformer又包含用于转换标签列的OneHotEncoder()。我自己尝试过,但失败了。

这是我数据框中的一行示例:

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>id</th>
      <th>image</th>
      <th>label</th>
      <th>deep_features</th>
      <th>image_array</th></tr> </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>24</td>
      <td>Height: 32 Width: 32</td>
      <td>bird</td>
      <td>[0.242872, 1.09545, 0.0, 0.39363, 0.0, 0.0, 11.8949, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.57885, 0.495467, 2.51413, 0.0, 1.51801, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.5814, 0.0, 0.0, 2.59561, 2.70796,
        0.0, 0.0, 0.0, 0.85099, 0.0, 0.720349, 0.0, 0.0, 0.0, 0.0, 0.0, 0.270036, 0.0, 0.0, 0.0, 0.0, 0.085928, 0.0, 0.701023, 0.0, 0.0, 0.0, 0.0, 0.0248057, 0.0, 0.0, 0.17549, 0.0, 0.0, 0.0, 0.0, 0.0, 2.39278, 0.0, 0.0, 4.47187, 0.0, 1.63583, 0.0, 4.41748,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.41179, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]</td>
      <td>[73, 77, 58, 71, 68, 50, 77, 69, 44, 120, 116, 83, 125, 120, 90, 155, 148, 117, 147, 152, 106, 133, 136, 92, 112, 101, 75, 100, 84, 68, 100, 78, 74, 69, 49, 51, 22, 16, 13, 35, 27, 24, 52, 40, 40, 70, 55, 54, 117, 97, 89, 122, 101, 99, 103, 85,
        88, 88, 74, 73, 68, 59, 52, 72, 65, 52, 82, 73, 60, 84, 72, 68, 84, 67, 73, 77, 56, 65, 93, 75, 81, 117, 102, 98, 174, 171, 146, 183, 210, 149, 180, 214, 143, 185, 225, 144, 73, 74, 52, 75, ...]</td> </tr> </tbody></table>

这是我的代码:

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

# Column transformer that applies OneHotEncoder to the column selected by the
# string name 'label'.
preprocessor = ColumnTransformer([('Output_Preprocessor', OneHotEncoder(), ['label'])])
# Two-step pipeline: the column transformer runs first, then a multinomial
# logistic regression is fitted on its output.
pipeline = Pipeline(
    steps=[
        ('Preprocessor', preprocessor),
        ('Logistic_Regression', LogisticRegression(multi_class='multinomial', solver='lbfgs'))
    ],
    verbose=True
)

# NOTE(review): np, pd and train_data are not defined in this snippet;
# presumably they were set up earlier in the session.
# Features: the per-row lists stored in 'image_array', converted to a NumPy array.
X_train = np.array(train_data['image_array'].tolist())
# Target: a single-column DataFrame named "label" with a 0..n-1 index.
# NOTE(review): building a DataFrame from an existing DataFrame with a new
# index looks like it reindexes rather than relabels; confirm train_data
# already has a 0..n-1 index.
y_train = pd.DataFrame(data=train_data[['label']], index=np.arange(len(train_data[['label']])), columns=["label"])#np.array(train_data[['label']])
# The pipeline passes X_train (not y_train) to the 'Preprocessor' step, so the
# ColumnTransformer looks for a 'label' column on a NumPy array. That raises
# "Specifying the columns using strings is only supported for pandas
# DataFrames", as shown in the traceback below.
pipeline.fit(X_train, y_train)

运行时遇到以下错误:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
~/anaconda3/envs/myenv/lib/python3.8/site-packages/sklearn/utils/__init__.py in _get_column_indices(X, key)
    424         try:
--> 425             all_columns = X.columns
    426         except AttributeError:

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-80-0efd21a8b1dd> in <module>
      1 X_train = np.array(train_data['image_array'].tolist())
      2 y_train = pd.DataFrame(data=train_data[['label']], index=np.arange(len(train_data[['label']])), columns=["label"])#np.array(train_data[['label']])
----> 3 pipeline.fit(X_train, y_train)

~/anaconda3/envs/myenv/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    328         """
    329         fit_params_steps = self._check_fit_params(**fit_params)
--> 330         Xt = self._fit(X, y, **fit_params_steps)
    331         with _print_elapsed_time('Pipeline',
    332                                  self._log_message(len(self.steps) - 1)):

~/anaconda3/envs/myenv/lib/python3.8/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
    290                 cloned_transformer = clone(transformer)
    291             # Fit or load from cache the current transformer
--> 292             X, fitted_transformer = fit_transform_one_cached(
    293                 cloned_transformer, X, y, None,
    294                 message_clsname='Pipeline',

~/anaconda3/envs/myenv/lib/python3.8/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    350 
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353 
    354     def call_and_shelve(self, *args, **kwargs):

~/anaconda3/envs/myenv/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    738     with _print_elapsed_time(message_clsname, message):
    739         if hasattr(transformer, 'fit_transform'):
--> 740             res = transformer.fit_transform(X, y, **fit_params)
    741         else:
    742             res = transformer.fit(X, y, **fit_params).transform(X)

~/anaconda3/envs/myenv/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
    527         self._validate_transformers()
    528         self._validate_column_callables(X)
--> 529         self._validate_remainder(X)
    530 
    531         result = self._fit_transform(X, y, _fit_transform_one)

~/anaconda3/envs/myenv/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py in _validate_remainder(self, X)
    325         cols = []
    326         for columns in self._columns:
--> 327             cols.extend(_get_column_indices(X, columns))
    328 
    329         remaining_idx = sorted(set(range(self._n_features)) - set(cols))

~/anaconda3/envs/myenv/lib/python3.8/site-packages/sklearn/utils/__init__.py in _get_column_indices(X, key)
    425             all_columns = X.columns
    426         except AttributeError:
--> 427             raise ValueError("Specifying the columns using strings is only "
    428                              "supported for pandas DataFrames")
    429         if isinstance(key, str):

ValueError: Specifying the columns using strings is only supported for pandas DataFrames


0 个答案:

没有答案