由于我第一次使用python进行数据挖掘,我面临着调整参数和获得最佳参数值(cutoff,classwt,sampsize)的问题。我试图在scikit中使用随机森林找到不同类别的截止值。我使用以下代码
def cutoff_predict(rf,trainArr,cutoff):
return (rf.predict_prob(trainArr)[:,1]>cutoff).astype(int)
score=[]
def custom_f1(cutoff):
def f1_cutoff(rf,trainArr,y):
ypred=cutoff_predict(rf,trainArr,cutoff)
return sklearn.metrics.f1_score(Actualres,results)
return f1_cutoff
for cutoff in np.arange(0.1,0.9,0.1):
rf = RandomForestClassifier(n_estimators=100) #Random forest generation for Classification
rf.fit(trainArr, trainRes) #Fit the random forest model
validated=cross_val_score(rf,trainArr,trainRes,cv=10,scoring=custom_f1(cutoff))
score.append(validated)
但是我收到了以下错误。
IndexError Traceback (most recent call last)
<ipython-input-14-f8b808ce9a4d> in <module>()
94 rf = RandomForestClassifier(n_estimators=100) #Random forest generation for Classification
95 rf.fit(trainArr, trainRes) #Fit the random forest model
---> 96 validated=cross_val_score(rf,trainArr,trainRes,cv=10,scoring=custom_f1(cutoff))
97 score.append(validated)
C:\Python27\Anaconda\lib\site-packages\sklearn\cross_validation.pyc in cross_val_score(estimator, X, y, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
1350 X, y = indexable(X, y)
1351
-> 1352 cv = _check_cv(cv, X, y, classifier=is_classifier(estimator))
1353 scorer = check_scoring(estimator, scoring=scoring)
1354 # We clone the estimator to make sure that all the folds are
C:\Python27\Anaconda\lib\site-packages\sklearn\cross_validation.pyc in _check_cv(cv, X, y, classifier, warn_mask)
1604 if classifier:
1605 if type_of_target(y) in ['binary', 'multiclass']:
-> 1606 cv = StratifiedKFold(y, cv, indices=needs_indices)
1607 else:
1608 cv = KFold(_num_samples(y), cv, indices=needs_indices)
C:\Python27\Anaconda\lib\site-packages\sklearn\cross_validation.pyc in __init__(self, y, n_folds, indices, shuffle, random_state)
432 for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)):
433 for label, (_, test_split) in zip(unique_labels, per_label_splits):
--> 434 label_test_folds = test_folds[y == label]
435 # the test split can be too big because we used
436 # KFold(max(c, self.n_folds), self.n_folds) instead of
IndexError: too many indices for array
这可能是什么问题?另外:在&#39; R&#39;我们可以选择调整&#39; cutoff&#39;参数(cutoff = 1 /(类数))。在随机森林(scikit学习包)中是否有类似的参数来调整python?