我正在使用逻辑回归将图像分为3类,并且可能还会更多。我的标签列是train_data["label"]
,它由类名称的字符串组成。我想要做的是将所有数据帧传递到包含column_transformer的Pipeline,该Pipeline_transformer还包含OneHotEncoder()来转换标签列。我自己尝试一下,但失败了。
这是我数据框中的一行示例:
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>id</th>
<th>image</th>
<th>label</th>
<th>deep_features</th>
<th>image_array</th></tr> </thead>
<tbody>
<tr>
<th>0</th>
<td>24</td>
<td>Height: 32 Width: 32</td>
<td>bird</td>
<td>[0.242872, 1.09545, 0.0, 0.39363, 0.0, 0.0, 11.8949, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.57885, 0.495467, 2.51413, 0.0, 1.51801, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.5814, 0.0, 0.0, 2.59561, 2.70796,
0.0, 0.0, 0.0, 0.85099, 0.0, 0.720349, 0.0, 0.0, 0.0, 0.0, 0.0, 0.270036, 0.0, 0.0, 0.0, 0.0, 0.085928, 0.0, 0.701023, 0.0, 0.0, 0.0, 0.0, 0.0248057, 0.0, 0.0, 0.17549, 0.0, 0.0, 0.0, 0.0, 0.0, 2.39278, 0.0, 0.0, 4.47187, 0.0, 1.63583, 0.0, 4.41748,
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.41179, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]</td>
<td>[73, 77, 58, 71, 68, 50, 77, 69, 44, 120, 116, 83, 125, 120, 90, 155, 148, 117, 147, 152, 106, 133, 136, 92, 112, 101, 75, 100, 84, 68, 100, 78, 74, 69, 49, 51, 22, 16, 13, 35, 27, 24, 52, 40, 40, 70, 55, 54, 117, 97, 89, 122, 101, 99, 103, 85,
88, 88, 74, 73, 68, 59, 52, 72, 65, 52, 82, 73, 60, 84, 72, 68, 84, 67, 73, 77, 56, 65, 93, 75, 81, 117, 102, 98, 174, 171, 146, 183, 210, 149, 180, 214, 143, 185, 225, 144, 73, 74, 52, 75, ...]</td> </tr> </tbody></table>
这是我的代码:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
preprocessor = ColumnTransformer([('Output_Preprocessor', OneHotEncoder(), ['label'])])
pipeline = Pipeline(
steps=[
('Preprocessor', preprocessor),
('Logistic_Regression', LogisticRegression(multi_class='multinomial', solver='lbfgs'))
],
verbose=True
)
X_train = np.array(train_data['image_array'].tolist())
y_train = pd.DataFrame(data=train_data[['label']], index=np.arange(len(train_data[['label']])), columns=["label"])#np.array(train_data[['label']])
pipeline.fit(X_train, y_train)
我跑步时遇到以下错误:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
~/anaconda3/envs/myenv/lib/python3.8/site-packages/sklearn/utils/__init__.py in _get_column_indices(X, key)
424 try:
--> 425 all_columns = X.columns
426 except AttributeError:
AttributeError: 'numpy.ndarray' object has no attribute 'columns'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-80-0efd21a8b1dd> in <module>
1 X_train = np.array(train_data['image_array'].tolist())
2 y_train = pd.DataFrame(data=train_data[['label']], index=np.arange(len(train_data[['label']])), columns=["label"])#np.array(train_data[['label']])
----> 3 pipeline.fit(X_train, y_train)
~/anaconda3/envs/myenv/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
328 """
329 fit_params_steps = self._check_fit_params(**fit_params)
--> 330 Xt = self._fit(X, y, **fit_params_steps)
331 with _print_elapsed_time('Pipeline',
332 self._log_message(len(self.steps) - 1)):
~/anaconda3/envs/myenv/lib/python3.8/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
290 cloned_transformer = clone(transformer)
291 # Fit or load from cache the current transformer
--> 292 X, fitted_transformer = fit_transform_one_cached(
293 cloned_transformer, X, y, None,
294 message_clsname='Pipeline',
~/anaconda3/envs/myenv/lib/python3.8/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~/anaconda3/envs/myenv/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
738 with _print_elapsed_time(message_clsname, message):
739 if hasattr(transformer, 'fit_transform'):
--> 740 res = transformer.fit_transform(X, y, **fit_params)
741 else:
742 res = transformer.fit(X, y, **fit_params).transform(X)
~/anaconda3/envs/myenv/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
527 self._validate_transformers()
528 self._validate_column_callables(X)
--> 529 self._validate_remainder(X)
530
531 result = self._fit_transform(X, y, _fit_transform_one)
~/anaconda3/envs/myenv/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py in _validate_remainder(self, X)
325 cols = []
326 for columns in self._columns:
--> 327 cols.extend(_get_column_indices(X, columns))
328
329 remaining_idx = sorted(set(range(self._n_features)) - set(cols))
~/anaconda3/envs/myenv/lib/python3.8/site-packages/sklearn/utils/__init__.py in _get_column_indices(X, key)
425 all_columns = X.columns
426 except AttributeError:
--> 427 raise ValueError("Specifying the columns using strings is only "
428 "supported for pandas DataFrames")
429 if isinstance(key, str):
ValueError: Specifying the columns using strings is only supported for pandas DataFrames