我正在尝试使用 sklearn 中的 LinearDiscriminantAnalysis,以下是我目前写的代码:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import numpy as np
import pandas as pd
# Reading csv file
training_file = 'Training.csv'
testing_file = 'Test.csv'
dataframe_train = pd.read_csv(training_file)
dataframe_test = pd.read_csv(testing_file)
# Add an 'onehot_code' column computed row by row from the 'Onehot' column.
# NOTE(review): onehot_processing and numberOFclasses are not defined in this
# snippet; presumably the helper returns a one-hot Python list per row - confirm.
dataframe_train['onehot_code'] = dataframe_train.apply(lambda x : onehot_processing(int(float(x['Onehot'])), numberOFclasses), axis=1)
dataframe_test['onehot_code'] = dataframe_test.apply(lambda x:onehot_processing(int(float(x['Onehot'])),numberOFclasses),axis=1)
# Standardise the feature columns (everything except the last three columns).
# The scaler is fitted on the training data and then reused for the test data.
# NOTE(review): `preprocessing` is not imported in this snippet; presumably
# `from sklearn import preprocessing` is intended - confirm.
stdsc = preprocessing.StandardScaler()
np_scaled_train = stdsc.fit_transform(dataframe_train.iloc[:,:-3])
np_scaled_test = stdsc.transform(dataframe_test.iloc[:,:-3])
# Fit a 2-component LDA, passing the last column (the 'onehot_code' column
# added above, i.e. a Series whose elements are lists) as the target y.
lda = LinearDiscriminantAnalysis(n_components=2)
Training_Frame = lda.fit_transform(np_scaled_train,dataframe_train.iloc[:,-1]) # the script crashes here
Testing_Frame = lda.transform(np_scaled_test)
我得到的错误信息是:
ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead.
数据框的形状是正确的,所以我不知道自己遗漏了什么:是需要先对某些数据做转换,函数才能接受这些参数,还是另有原因?
如有任何提示,不胜感激!
更新
以下是 dataframe_train.iloc[:,-1] 的内容:
0 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
6 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
7 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
8 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
9 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
10 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
11 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
12 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
13 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
14 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
15 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
16 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
17 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
18 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
19 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
20 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
21 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
22 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
23 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
24 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
25 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
26 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
27 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
28 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
29 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
...
2328 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2329 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2330 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2331 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2332 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2333 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2334 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2335 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2336 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2337 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2338 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2339 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2340 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2341 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2342 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2343 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2344 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2345 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2346 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2347 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2348 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2349 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2350 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2351 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2352 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2353 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2354 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2355 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2356 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2357 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: onehot_code, dtype: object
每行是20个元素的向量。
第二次更新
运行以下代码:Training_Frame = lda.fit_transform(np_scaled_train,np.asarray(dataframe_train.iloc[:,-1]))
会得到如下错误信息:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-7-a8adf693ad9e> in <module>()
----> 1 Training_Frame = lda.fit_transform(np_scaled_train,np.asarray(dataframe_train.iloc[:,-1]))
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
495 else:
496 # fit method of arity 2 (supervised transformation)
--> 497 return self.fit(X, y, **fit_params).transform(X)
498
499
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\sklearn\discriminant_analysis.py in fit(self, X, y, store_covariance, tol)
441 self.tol = tol
442 X, y = check_X_y(X, y, ensure_min_samples=2, estimator=self)
--> 443 self.classes_ = unique_labels(y)
444
445 if self.priors is None: # estimate priors from sample
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\sklearn\utils\multiclass.py in unique_labels(*ys)
77 # Check that we don't mix label format
78
---> 79 ys_types = set(type_of_target(x) for x in ys)
80 if ys_types == set(["binary", "multiclass"]):
81 ys_types = set(["multiclass"])
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\sklearn\utils\multiclass.py in <genexpr>(.0)
77 # Check that we don't mix label format
78
---> 79 ys_types = set(type_of_target(x) for x in ys)
80 if ys_types == set(["binary", "multiclass"]):
81 ys_types = set(["multiclass"])
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\sklearn\utils\multiclass.py in type_of_target(y)
248 if (not hasattr(y[0], '__array__') and isinstance(y[0], Sequence)
249 and not isinstance(y[0], string_types)):
--> 250 raise ValueError('You appear to be using a legacy multi-label data'
251 ' representation. Sequence of sequences are no'
252 ' longer supported; use a binary array or sparse'
ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead.
答案 0 :(得分:0)
我尝试复现你的例子时,下面的做法对我有效。
# LinearDiscriminantAnalysis expects y to be a 1-D array holding one class
# label per sample. A Series of one-hot lists (dtype=object) or a 2-D
# indicator matrix is rejected, so the one-hot vectors are converted back to
# integer class labels: stack the per-row lists into an
# (n_samples, n_classes) array and take the position of the 1 in each row.
# NOTE(review): assumes every row contains exactly one 1; an all-zero row
# would silently be mapped to class 0 - confirm against onehot_processing.
y_train = np.array(dataframe_train.iloc[:, -1].tolist()).argmax(axis=1)
y_test = np.array(dataframe_test.iloc[:, -1].tolist()).argmax(axis=1)
lda = LinearDiscriminantAnalysis(n_components=2)
Training_Frame = lda.fit_transform(np_scaled_train, y_train)
# transform() only needs the features; y_test is kept for later evaluation.
Testing_Frame = lda.transform(np_scaled_test)
错误很可能源于 pandas 处理列中列表的方式,以及 numpy 对它们的解释方式。scikit-learn 会检查传入的 y 是否为受支持 dtype(int、float、string 等)的 numpy 数组;但在你的例子中,df.iloc[:, -1] 返回的是一个 pandas.Series,其每个元素都是一个列表,直接转换为 numpy 数组后得到的是 dtype=object 的数组,因此报错。
另外一个解决方法是(不使用上面的任何代码):
# Stack the per-row one-hot lists into a 2-D numeric array, then reduce it to
# a 1-D vector of integer class labels with argmax, because LDA only accepts
# a 1-D target.
# NOTE(review): assumes each one-hot row contains exactly one 1 - confirm.
Training_Frame = lda.fit_transform(
    np_scaled_train,
    np.array([np.array(r) for r in dataframe_train.iloc[:, -1]]).argmax(axis=1))
希望它适合你。