在Sklearn

时间:2017-06-02 07:16:45

标签: python scikit-learn linear-discriminant

我试图从sklearn实施LinearDiscriminantAnalysis,这是我到目前为止所做的:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import numpy as np
import pandas as pd
# Reading csv file
training_file = 'Training.csv'
testing_file  = 'Test.csv'
dataframe_train = pd.read_csv(training_file)
dataframe_test  = pd.read_csv(testing_file)
dataframe_train['onehot_code'] = dataframe_train.apply(lambda x : onehot_processing(int(float(x['Onehot'])), numberOFclasses), axis=1)
dataframe_test['onehot_code'] = dataframe_test.apply(lambda x:onehot_processing(int(float(x['Onehot'])),numberOFclasses),axis=1)
stdsc = preprocessing.StandardScaler()
np_scaled_train = stdsc.fit_transform(dataframe_train.iloc[:,:-3])
np_scaled_test  = stdsc.transform(dataframe_test.iloc[:,:-3])

lda = LinearDiscriminantAnalysis(n_components=2)
Training_Frame = lda.fit_transform(np_scaled_train,dataframe_train.iloc[:,-1])  # the script crashes here 

Testing_Frame  = lda.transform(np_scaled_test)

我得到的错误信息是:

ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead.

数据框的形状是正确的,所以我不知道我错过了什么,或者我应该转换什么,以便函数接受参数,或者是其他原因?

我会感激任何暗示!

更新

以下是dataframe_train.iloc[:,-1]的样子:

    0       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
6       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
7       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
8       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
9       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
10      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
11      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
12      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
13      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
14      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
15      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
16      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
17      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
18      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
19      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
20      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
21      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
22      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
23      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
24      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
25      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
26      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
27      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
28      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
29      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                              ...                        
2328    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2329    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2330    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2331    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2332    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2333    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2334    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2335    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2336    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2337    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2338    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2339    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2340    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2341    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2342    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2343    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2344    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2345    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2346    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2347    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2348    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2349    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2350    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2351    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2352    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2353    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2354    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2355    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2356    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2357    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: onehot_code, dtype: object

每行是20个元素的向量。

** 2nd_UPDATE"

运行以下内容:Training_Frame = lda.fit_transform(np_scaled_train,np.asarray(dataframe_train.iloc[:,-1]))

发送此错误消息:

        ---------------------------------------------------------------------------
    ValueError                                Traceback (most recent call last)
    <ipython-input-7-a8adf693ad9e> in <module>()
    ----> 1 Training_Frame = lda.fit_transform(np_scaled_train,np.asarray(dataframe_train.iloc[:,-1]))

    c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
        495         else:
        496             # fit method of arity 2 (supervised transformation)
    --> 497             return self.fit(X, y, **fit_params).transform(X)
        498 
        499 

    c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\sklearn\discriminant_analysis.py in fit(self, X, y, store_covariance, tol)
        441             self.tol = tol
        442         X, y = check_X_y(X, y, ensure_min_samples=2, estimator=self)
    --> 443         self.classes_ = unique_labels(y)
        444 
        445         if self.priors is None:  # estimate priors from sample

    c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\sklearn\utils\multiclass.py in unique_labels(*ys)
         77     # Check that we don't mix label format
         78 
    ---> 79     ys_types = set(type_of_target(x) for x in ys)
         80     if ys_types == set(["binary", "multiclass"]):
         81         ys_types = set(["multiclass"])

    c:\users\engine\appdata\local\programs\python\python35\lib\site-packages

\sklearn\utils\multiclass.py in <genexpr>(.0)
     77     # Check that we don't mix label format
     78 
---> 79     ys_types = set(type_of_target(x) for x in ys)
     80     if ys_types == set(["binary", "multiclass"]):
     81         ys_types = set(["multiclass"])

c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\sklearn\utils\multiclass.py in type_of_target(y)
    248         if (not hasattr(y[0], '__array__') and isinstance(y[0], Sequence)
    249                 and not isinstance(y[0], string_types)):
--> 250             raise ValueError('You appear to be using a legacy multi-label data'
    251                              ' representation. Sequence of sequences are no'
    252                              ' longer supported; use a binary array or sparse'

ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead.

1 个答案:

答案 0 :(得分:0)

当试图重复你的例子时,这对我有用。

y_train = dataframe_train.iloc[:,-1]
y_test = dataframe_test.iloc[:,-1]

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y_train  = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

lda = LinearDiscriminantAnalysis(n_components=2)
Training_Frame = lda.fit_transform(np_scaled_train, y_train)  

Testing_Frame  = lda.transform(np_scaled_test)

错误很可能是由于pandas如何处理列中的列表,以及numpy如何解释它们。 scikit-learn检查是否提供y是一个支持类型(dtypes)(int,float,string等)的numpy数组,但在你的情况下df.iloc[:, -1]返回一个pandas.Series,当它直接转换为numpy时,结果在dtype = object。因而错误。

另外一个解决方法是(不使用上面的任何代码):

Training_Frame = lda.fit_transform(np_scaled_train, 
                     np.array([np.array(r) for r in dataframe_train‌​.iloc[:,-1]]))

希望它适合你。