我有一个相对较小的、不平衡的数据集(约 3k 个数据点,12 个类)。我想调整
const config = [
{
id: 'obj1',
value: 'value1',
},
{
id: 'obj2',
value: 'value2',
},
{
id: 'obj3',
value: 'value3',
},
{
id: 'obj4',
value: 'value4',
},
]
/**
 * Builds one in-memory key/value store per config entry and indexes the
 * stores by their id.
 *
 * @param {Array<{id: *}>} [storeConfigs=config] - Store descriptors; only the
 *   `id` property of each entry is read. Defaults to the module-level `config`
 *   array, so a bare `init()` call behaves exactly as before.
 * @returns {Map<*, {id: *, set: Function, get: Function}>} Map from store id
 *   to the store object created for it.
 */
function init(storeConfigs = config) {
  const databaseMap = new Map();

  // Creates a store whose backing Map is only reachable through the returned
  // set/get methods (it is captured by closure, not exposed as a property).
  function template(storeConfig) {
    const { id } = storeConfig;
    const storeMap = new Map();
    return {
      id,
      set(key, value) {
        console.log(`Setting data to store ${id}.`);
        // Do future work here
        return storeMap.set(key, value);
      },
      get(key) {
        // Do future work here
        return storeMap.get(key);
      },
    };
  }

  // Later entries with a duplicate id replace earlier ones (Map.set semantics).
  for (const { id } of storeConfigs) {
    databaseMap.set(id, template({ id }));
  }
  return databaseMap;
}
// Build every configured store, then exercise the 'obj4' store end to end.
const db = init();
const getStore = db.get('obj4');

// Write one entry; the store logs the write before saving it.
getStore.set('testing1', 'testing1');

// Dump the store object itself (its id plus the set/get methods).
console.log('GET STORE');
console.log(getStore);

// Read back the value that was just written.
console.log('GET TESTING 1');
console.log(getStore.get('testing1'));
的参数并最终测试模型。
目前我的做法如下,但奇怪的是,模型在测试集上的得分高于训练集(我使用了 cohen_kappa_score 和准确率):
RandomForestClassifier
结果 - 准确度:
# Split data into training and test sets (70/30 stratified split)
x_train, x_test, y_train, y_test = train_test_split(X_Distances, Y, test_size=0.3, random_state=42, stratify=Y)

# Scorers evaluated for every candidate during parameter tuning
scoring = {'Accuracy': make_scorer(accuracy_score), 'Recall': 'recall_macro', 'Kappa': make_scorer(cohen_kappa_score)}

# Parameter ranges sampled by the randomized search
params_randomSearch = {"min_samples_leaf": np.arange(1, 30, 2),
                       "min_samples_split": np.arange(2, 20, 2),
                       "max_depth": np.arange(2, 20, 2),
                       "min_weight_fraction_leaf": np.arange(0., 0.4, 0.1),
                       "n_estimators": np.arange(10, 1000, 100),
                       # 'auto' is intentionally omitted: for classifiers it was an alias of
                       # 'sqrt' (a duplicate candidate), it was deprecated in scikit-learn 1.1,
                       # and it is rejected as an invalid value from 1.3 on.
                       "max_features": ['sqrt', 'log2', None],
                       "criterion": ['entropy', 'gini']}
if __name__ == '__main__':
    # Randomized search over params_randomSearch with 3-fold CV on the training
    # split only; the candidate with the best mean 'Kappa' is refit on all of
    # x_train and exposed as rs.best_estimator_.
    rs = RandomizedSearchCV(RandomForestClassifier(random_state=42), param_distributions=params_randomSearch, scoring=scoring, cv=3, refit='Kappa', n_iter=60, n_jobs=-1, random_state=42)
    rs.fit(x_train, y_train)
    # best_score_ is the mean cross-validated score of the refit metric ('Kappa').
    print('Best Score: ', rs.best_score_, '\nBest parameters: ', rs.best_params_)

    # Evaluate the refit model on the held-out test split.
    y_predict = rs.best_estimator_.predict(x_test)
    # NOTE: despite its name, `acc` holds Cohen's kappa, not accuracy.
    acc = cohen_kappa_score(y_test, y_predict)
best_score_ = {float64} 0.5103216514642342
best_params_ = {dict} {'n_estimators': 310, 'min_weight_fraction_leaf': 0.0, 'min_samples_split': 12, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': 14, 'criterion': 'entropy'}
# 1. Eval
Accuracy of base (default) classifier on test set: 0.47928331466965285
Accuracy of classifier with best_params of RandomSearchCV on test set: 0.5666293393057111
的结果相同,有趣的是,我使用 'Kappa' 时得到了完全相同的模型。为cohen_kappa_score
得分
我不知道这是否是可以接受的结果,或者我的方法是否有问题。会不会是因为数据集很小,测试集恰好更"容易"?
编辑:refit
数据拆分结果为random_state = 1
:
cohen_kappa