运行Kmeans时如何解决“异常:数据必须为一维”错误

时间:2019-05-03 01:58:58

标签: numpy scikit-learn k-means sklearn-pandas dimension

到目前为止,之前遇到的错误我都已经解决了。但对于现在这个问题,除了错误信息“Exception: Data must be 1-dimensional”(数据必须为一维)之外,我并不确定自己是否理解问题出在哪里。下面是我的代码,这是我正在使用的Excel文件的链接。

import pandas as pd
import numpy as np
import warnings
from sklearn import preprocessing 
from sklearn.preprocessing import LabelBinarizer
from sklearn.cluster import KMeans
# Load the 'PERM_FY2018' sheet of the disclosure workbook into a DataFrame.
df1 = pd.read_excel('PERM_Disclosure_Data_FY2018_EOYV2.xlsx', 'PERM_FY2018')
# Install an "ignore" filter for Python warnings.
warnings.filterwarnings("ignore")
# Drop rows that lack a wage amount, a case status or an SOC title.
df1 = df1.dropna(subset=['PW_AMOUNT_9089'])
df1 = df1.dropna(subset=['CASE_STATUS'])
df1 = df1.dropna(subset=['PW_SOC_TITLE'])
# Relabel 'Certified-Expired' as 'Certified'.
# NOTE(review): this is chained indexing (attribute access followed by a
# boolean-mask assignment); the pandas docs recommend
# `df.loc[mask, 'CASE_STATUS'] = ...` -- confirm the write lands in df1.
df1.CASE_STATUS[df1['CASE_STATUS']=='Certified-Expired'] = 'Certified'
# Exclude withdrawn cases.
df1 = df1[df1.CASE_STATUS != 'Withdrawn']
# Drop every row that has a missing value in any column.
df1 = df1.dropna()
# Remove rows whose wage is the literal '#############' string.
df1 = df1[df1.PW_AMOUNT_9089 != '#############']
# Same three column-wise dropna calls as above, repeated after the filtering.
df1 = df1.dropna(subset=['PW_AMOUNT_9089'])
df1 = df1.dropna(subset=['CASE_STATUS'])
df1 = df1.dropna(subset=['PW_SOC_TITLE'])
# Convert the wage column to float.
df1.PW_AMOUNT_9089 = df1.PW_AMOUNT_9089.astype(float)
# Keep only the columns at positions 2, 4 and 5 (presumably CASE_STATUS,
# PW_SOC_TITLE and PW_AMOUNT_9089 -- verify against the sheet layout).
df1=df1.iloc[:, [2,4,5]]
# Binarize the case status and keep the first output column.
# Indexing with the list [0] (rather than the scalar 0) keeps the result
# two-dimensional, i.e. a column vector of shape (n, 1).
enc = LabelBinarizer()
y = enc.fit_transform(df1.CASE_STATUS)[:, [0]]

此时y的输出是一个数组:

array([[0],
       [0],
       [0],
       ...,
       [1],
       [1],
       [0]])

然后我定义XZ

# Build the feature frame XZ from the second and third columns of df1.
le = preprocessing.LabelEncoder()
# List indexers ([1], [2]) keep X and Z as single-column DataFrames.
X = df1.iloc[:, [1]]
Z = df1.iloc[:, [2]]
# Label-encode each column of X into integer codes.
X2 = X.apply(le.fit_transform)
# Place the encoded column and the Z column side by side.
XZ = pd.concat([X2,Z], axis=1)

XZ的输出是:

PW_SOC_TITLE PW_AMOUNT_9089
    12  176 60778.0
    13  456 100901.0
    14  134 134389.0
    15  134 104936.0
    16  134 95160.0
    17  294 66976.0
    18  73  38610.0
    19  598 122533.0
    20  220 109574.0
    21  99  67850.0
    22  399 132018.0
    23  68  56118.0
    24  139 136781.0
    25  134 111405.0
    26  598 58573.0
    27  362 75067.0
    28  598 85862.0
    29  572 33301.0
    30  598 112840.0
    31  134 134971.0
    32  176 100568.0
    33  176 100568.0
    34  626 19614.0
    35  153 26354.0
    36  405 79248.0
    37  220 93350.0
    38  139 153213.0
    39  598 131997.0
    40  598 131997.0
    41  1   90438.0
    ... ... ...
    119741  495 23005.0
    119742  63  46030.0
    119743  153 20301.0
    119744  95  21965.0
    119745  153 29890.0
    119746  295 79680.0
    119747  349 79498.0
    119748  223 38930.0
    119749  223 38930.0
    119750  570 39160.0
    119751  302 119392.0
    119752  598 106001.0
    119753  416 64230.0
    119754  598 115482.0
    119755  99  80205.0
    119756  134 78329.0
    119757  598 109325.0
    119758  598 109325.0
    119759  570 49770.0
    119760  194 18117.0
    119761  404 46987.0
    119762  189 35131.0
    119763  73  49900.0
    119764  323 32240.0
    119765  372 28122.0
    119766  468 67974.0
    119767  399 78520.0
    119768  329 25875.0
    119769  329 25875.0
    119770  601 82098.0

然后我继续:

from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing, stratified on y, with a fixed seed.
# The split keeps y's shape, so y_train / y_test are still (n, 1) columns.
XZ_train, XZ_test, y_train, y_test = train_test_split(XZ, y,
                                                  test_size = .25,
                                                  random_state=20, 
                                                    stratify=y )

# loading library
from pandas_ml import ConfusionMatrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Fit and score a KNN classifier for each weighting scheme and each odd k from 1 to 9.
for weights in ['uniform', 'distance']:
    for i in range(1,11,2):  # i takes the values 1, 3, 5, 7, 9
        knn = KNeighborsClassifier(n_neighbors=i, weights=weights)
        # fitting the model
        knn.fit(XZ_train, y_train)  # y_train is a column vector (n, 1); sklearn warns and suggests ravel()
        # predict the response
        pred = knn.predict(XZ_test)
        confusion = ConfusionMatrix(y_test, pred)  # pandas_ml wraps y_test in pd.Series, which rejects 2-D input
        if i<11:  # always true for i in range(1, 11, 2)
        # evaluate accuracy
                print('Weight Measure:', knn.weights)
                print('n_neighbors=', knn.n_neighbors)
                print('Accuracy=', accuracy_score(y_test, pred))
                #print('')
                #print('Confusion Matrix')
                #print(confusion)
                print('-----------------------------')

我得到的错误如下:

G:\Anaconda\lib\site-packages\ipykernel_launcher.py:11: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  # This is added back by InteractiveShellApp.init_path()
---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-20-bf6054d911ba> in <module>
     12         # predict the response
     13         pred = knn.predict(XZ_test)
---> 14         confusion = ConfusionMatrix(y_test, pred)
     15         if i<11:
     16         # evaluate accuracy

G:\Anaconda\lib\site-packages\pandas_ml\confusion_matrix\cm.py in __new__(cls, y_true, y_pred, *args, **kwargs)
     21             if len(set(uniq_true) - set(uniq_pred)) == 0:
     22                 from pandas_ml.confusion_matrix.bcm import BinaryConfusionMatrix
---> 23                 return BinaryConfusionMatrix(y_true, y_pred, *args, **kwargs)
     24         return LabeledConfusionMatrix(y_true, y_pred, *args, **kwargs)
     25 

G:\Anaconda\lib\site-packages\pandas_ml\confusion_matrix\bcm.py in __init__(self, *args, **kwargs)
     19     def __init__(self, *args, **kwargs):
     20         # super(BinaryConfusionMatrix, self).__init__(y_true, y_pred)
---> 21         super(BinaryConfusionMatrix, self).__init__(*args, **kwargs)
     22         assert self.len() == 2, \
     23             "Binary confusion matrix must have len=2 but \

G:\Anaconda\lib\site-packages\pandas_ml\confusion_matrix\abstract.py in __init__(self, y_true, y_pred, labels, display_sum, backend, true_name, pred_name)
     31             self._y_true.name = self.true_name
     32         else:
---> 33             self._y_true = pd.Series(y_true, name=self.true_name)
     34 
     35         if isinstance(y_pred, pd.Series):

G:\Anaconda\lib\site-packages\pandas\core\series.py in __init__(self, data, index, dtype, name, copy, fastpath)
    273             else:
    274                 data = _sanitize_array(data, index, dtype, copy,
--> 275                                        raise_cast_failure=True)
    276 
    277                 data = SingleBlockManager(data, index, fastpath=True)

G:\Anaconda\lib\site-packages\pandas\core\series.py in _sanitize_array(data, index, dtype, copy, raise_cast_failure)
   4163     elif subarr.ndim > 1:
   4164         if isinstance(data, np.ndarray):
-> 4165             raise Exception('Data must be 1-dimensional')
   4166         else:
   4167             subarr = com._asarray_tuplesafe(data, dtype=dtype)

Exception: Data must be 1-dimensional

我传入的数据类型不正确吗?这些数据类型与我在以往项目中使用的数据类型一致,所以我以为可以在这里照搬同样的做法。补充说明:X是经过编码的公司名称,Y(即代码中的y)是二值化后的案例状态,Z是float类型的工资金额。

1 个答案:

答案 0 :(得分:1)

“ ... y的输出是一个数组...” 您显示的数组是二维的,形状为(n, 1)。(其中一个维度的长度为1,但它仍然是二维数组。)请使用 y[:, 0] 或 y.ravel() 之类的操作来得到一维版本。