我将数据集手动拆分为3个折叠。在cross_val_score上使用这些折叠效果很好,但在cross_val_predict上使用这些折叠返回:
“ValueError: cross_val_predict only works for partitions”(cross_val_predict仅适用于划分)
或
“ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()”(具有多个元素的数组的真值有歧义,请使用a.any()或a.all())
取决于我使用的数据结构。
我已经尝试将sklearn从0.17.1切换到0.20.2,但这没有帮助。将kfolds转换为numpy.array时返回第二个异常;如果kfolds的类型为list,则返回第一个异常。
我不明白的是,cross_val_score和cross_val_predict的文档说参数“cv”可以是:
- None,使用默认的3折交叉验证,
- 整数,用于指定(Stratified)KFold中的折数,
- CV分割器(CV splitter),
- 一个可迭代对象,每次产生一组(train, test)划分,划分以索引数组表示。
列表和numpy.array怎么会不是可迭代对象?我应该使用什么数据结构?
def custom_folds(file_names, class_len, aug):
    """Build (train, test) index pairs from an on-disk fold layout.

    Every entry of ``folds_path`` is treated as one fold. Inside a fold,
    files found below a ``training`` directory form the training split and
    files found below a ``test`` or ``validation`` directory form the test
    split. The fold directories only supply file *names*; each name is
    mapped back to its position in ``file_names`` so the result indexes
    the whole dataset.

    Args:
        file_names: Sequence of hashable file names, in dataset order.
        class_len: Not used by the visible part of this function.
        aug: Not used by the visible part of this function.

    Returns:
        ``folds`` with one ``(train_idx, test_idx)`` tuple of index lists
        appended per fold, in sorted fold-directory order.

    Note:
        scikit-learn's ``cross_val_predict`` only accepts such a list when
        the test index sets of all folds together contain every sample
        exactly once; nothing here enforces that.
    """
    # ...
    # ...
    # ...
    # NOTE(review): `folds_path` and `folds` are not defined in the visible
    # code; presumably the elided setup above provides them -- confirm.
    for fold in sorted(os.listdir(folds_path)):
        # Sets give O(1) membership tests when mapping names to indices.
        training = set()
        test = set()
        fold_path = os.path.join(folds_path, fold)
        for set_type in sorted(os.listdir(fold_path)):
            set_path = os.path.join(fold_path, set_type)
            for _root, _dirs, files in os.walk(set_path):
                if set_type == 'training':
                    training.update(files)
                elif set_type in ('test', 'validation'):
                    test.update(files)
        # enumerate() keeps dataset order and, unlike list.index(), gives
        # each duplicate name its own position.
        train_idx = [i for i, name in enumerate(file_names) if name in training]
        test_idx = [i for i, name in enumerate(file_names) if name in test]
        folds.append((train_idx, test_idx))
    return folds
def cross_val(X, y, estimator, aug):
    """Cross-validate ``estimator`` on the custom folds and report results.

    Args:
        X: Feature data passed through to ``cross_val_score``.
        y: Iterable of hashable target labels.
        estimator: Estimator exposing ``feature_importances_`` once fitted.
        aug: Forwarded unchanged to ``custom_folds``.

    Returns:
        Tuple ``(mean_score, importances, kfold)``: the mean of the
        per-fold scores, the estimator's feature importances, and the
        fold list so callers can reuse the same splits.

    Raises:
        AttributeError: If the estimator has no ``feature_importances_``
            even after fitting.
    """
    num_class = len(set(y))
    # `file_names` is not a parameter; it is resolved from the enclosing
    # (module) scope at call time.
    kfold = custom_folds(file_names, num_class, aug)
    # cross_val_score works all the time
    scores = cross_val_score(estimator, X, y, cv=kfold)
    mean_score = np.mean(scores)
    print('mean score: ' + str(mean_score))
    # relative importance
    # cross_val_score fits clones, so `estimator` itself is only fitted if
    # the caller did it beforehand; fit on the full data when it was not.
    if not hasattr(estimator, 'feature_importances_'):
        estimator.fit(X, y)
    importances = estimator.feature_importances_
    print(' importances: ' + str(importances))
    return mean_score, importances, kfold
def confuse(X, y, estimator, aug):
    """Compute a float32 confusion matrix from cross-validated predictions.

    Runs ``cross_val`` to obtain the score, importances and fold list,
    predicts every sample with ``cross_val_predict`` on those same folds,
    and builds the confusion matrix over the sorted unique labels of ``y``.

    NOTE(review): the visible part of this function has no ``return``;
    ``confusion`` is computed but not handed back here.
    """
    accuracy, importances, kfold = cross_val(X, y, estimator, aug)
    # scikit-learn raises ValueError here unless the test sets in `kfold`
    # together cover every sample exactly once.
    pred = cross_val_predict(estimator, X, y, cv=kfold)
    labels = sorted(set(y))
    counts = confusion_matrix(y_true=y, y_pred=pred, labels=labels)
    confusion = np.float32(counts)