我收到此错误:"ValueError: Unknown label type: 'unknown'"
我在网上搜索过,但仍然无法解决这个错误。顺便说一下,我是 Python 新手 :)
我的数据有5行22列,最后一列是Label(True,False)
dataset = pandas.read_csv(path) #Dataframe created
数据如下所示:
dataset.head()
loc v(g) ev(g) iv(g) n v l d i e ... lOCode lOComment lOBlank locCodeAndComment uniq_Op uniq_Opnd total_Op total_Opnd branchCount defects
0 1.0 1.0 1.0 1.0 1.0 1.00 1.0 1.0 1.00 1.00 ... 1 1 1 1 1.0 1.0 1.0 1.0 1.0 True
1 1.1 1.4 1.4 1.4 1.3 1.30 1.3 1.3 1.30 1.30 ... 2 2 2 1 1.2 1.2 1.2 1.2 1.4 False
2 2.0 1.0 1.0 1.0 1.0 0.00 0.0 0.0 0.00 0.00 ... 0 0 1 0 1.0 0.0 1.0 0.0 1.0 False
3 2.0 1.0 1.0 1.0 1.0 0.00 0.0 0.0 0.00 0.00 ... 0 0 1 0 1.0 0.0 1.0 0.0 1.0 False
4 3.0 1.0 1.0 1.0 22.0 85.95 0.2 5.0 17.19 429.76 ... 1 0 3 0 10.0 5.0 17.0 5.0 1.0 False
5 rows × 22 columns
其余代码:
array = dataset.values
X = array[:,0:21] # Row=ALL, Col=1 to 21 (index=0to20)
Y = array[:,21] # Row=ALL, Col=22nd (index=21)
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.20, random_state=0) #80% Training data , 20% Test data
kfold = model_selection.KFold(n_splits=10, random_state=0)
cv_results = []
我在以下一行收到错误:
cv_results = model_selection.cross_val_score(SVC(), X_train, Y_train, cv=kfold, scoring='accuracy')
详细错误:
ValueError Traceback (most recent call last)
<ipython-input-31-e1234a2bbe9b> in <module>()
----> 1 cv_results = model_selection.cross_val_score(SVC(), X_train, Y_train, cv=kfold, scoring='accuracy')
C:\Program Files\Anaconda2\lib\site-packages\sklearn\model_selection\_validation.pyc in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
138 train, test, verbose, None,
139 fit_params)
--> 140 for train, test in cv_iter)
141 return np.array(scores)[:, 0]
142
C:\Program Files\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
C:\Program Files\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.pyc in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
C:\Program Files\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.pyc in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
C:\Program Files\Anaconda2\lib\site-packages\sklearn\externals\joblib\_parallel_backends.pyc in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
C:\Program Files\Anaconda2\lib\site-packages\sklearn\externals\joblib\_parallel_backends.pyc in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
C:\Program Files\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\Program Files\Anaconda2\lib\site-packages\sklearn\model_selection\_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
236 estimator.fit(X_train, **fit_params)
237 else:
--> 238 estimator.fit(X_train, y_train, **fit_params)
239
240 except Exception as e:
C:\Program Files\Anaconda2\lib\site-packages\sklearn\svm\base.pyc in fit(self, X, y, sample_weight)
150
151 X, y = check_X_y(X, y, dtype=np.float64, order='C', accept_sparse='csr')
--> 152 y = self._validate_targets(y)
153
154 sample_weight = np.asarray([]
C:\Program Files\Anaconda2\lib\site-packages\sklearn\svm\base.pyc in _validate_targets(self, y)
518 def _validate_targets(self, y):
519 y_ = column_or_1d(y, warn=True)
--> 520 check_classification_targets(y)
521 cls, y = np.unique(y_, return_inverse=True)
522 self.class_weight_ = compute_class_weight(self.class_weight, cls, y_)
C:\Program Files\Anaconda2\lib\site-packages\sklearn\utils\multiclass.pyc in check_classification_targets(y)
170 if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
171 'multilabel-indicator', 'multilabel-sequences']:
--> 172 raise ValueError("Unknown label type: %r" % y_type)
173
174
ValueError: Unknown label type: 'unknown'
答案 0 :(得分:2)
第1部分
您获得的错误与您使用的y变量相关。
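您需要将 True/False 转换为 0/1,以便 Y 变量只包含 0 和 1。由于数据中同时有数值列和布尔列,dataset.values 得到的是 object 类型的数组,scikit-learn 无法识别这种标签类型;例如可以使用 Y = Y.astype('int') 进行转换。这应该可以解决错误。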
摘自文档(参见此处):
y : array-like, shape (n_samples,)
Target values (class labels in classification, real numbers in regression)
第2部分
接下来,您应该在两种做法中二选一:要么使用交叉验证,它会自动将数据拆分为 X_train、X_test 和 y_train、y_test;要么使用 train_test_split 函数拆分数据,然后手动执行以下操作:
clf = SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
...
另一方面,如果您想使用 KFold 进行交叉验证(cross validation),请使用:
kfold = model_selection.KFold(n_splits=10, random_state=0)
cv_results = model_selection.cross_val_score(SVC(), X, Y, cv=kfold, scoring='accuracy')
这将自动创建 X_train、X_test 和 y_train、y_test,并为您返回 cv_results。