我无法将KS测试应用于数据集中产生的每个算法。在这种情况下,我尝试使用基于分类的算法来预测二进制类别default.payment.next.month。我已经阅读了很多有关如何在Python中计算KS值的示例,但是在我的情况下,所有这些示例都不起作用。我已经使用一些指标来测试算法,但是无法计算信用卡数据集中的KS值。
credit_data = pd.read_csv('../Desktop/Data path/Credit_data.csv')
_y_target = credit_data['default.payment.next.month'].values
columns = credit_data.columns.tolist()
columns.remove('default.payment.next.month')
_x_attributes = credit_data[columns].values
_x_train,_x_test,_y_train, _y_test = train_test_split(_x_attributes,
_y_target, test_size =0.30, stratify = _y_target, random_state = 1)
print("label counts in y train %s" %bincount(_y_train))
print("label counts in y test %s" %bincount(_y_test))
def plotLearningCurve(_x_train, _y_train, learning_model_pipeline,
k_fold = 10, training_sample_sizes = linspace(0.1,1.0,10),
jobsInParallel = -1):
training_size, training_score, testing_score = learning_curve(estimator = learning_model_pipeline,
X = _x_train,
y = _y_train,
train_sizes = training_sample_sizes,
cv = k_fold,
n_jobs = jobsInParallel)
training_mean = mean(training_score, axis = 1)
training_std_deviation = std(training_score, axis = 1)
testing_std_deviation = std(testing_score, axis = 1)
testing_mean = mean(testing_score, axis = 1 )
plot.plot(training_size, training_mean, label= "Training Data",
marker= '+', color = 'blue', markersize = 8)
plot.fill_between(training_size, training_mean+
training_std_deviation, training_mean-training_std_deviation,
color='blue', alpha =0.12 )
plot.plot(training_size, testing_mean, label= "Testing/Validation Data", marker= '*', color = 'green', markersize = 8)
plot.fill_between(training_size, testing_mean+
training_std_deviation, testing_mean-training_std_deviation,
color='green', alpha =0.14)
plot.title("Scoring of our training and testing data vs sample sizes")
plot.xlabel("Number of Samples")
plot.ylabel("Accuracy")
plot.legend(loc= 'best')
plot.show()
def runGridSearchAndPredict(pipeline, x_train, y_train, x_test,
y_test, param_grid, n_jobs = -1, cv = 10, score = 'accuracy'):
response = {}
training_timer = CodeTimer('training')
testing_timer = CodeTimer('testing')
learning_curve_timer = CodeTimer('learning_curve')
predict_proba_timer = CodeTimer('predict_proba')
with training_timer:
gridsearch = GridSearchCV(estimator= ipeline, param_grid=param_grid, cv=cv, n_jobs=n_jobs, scoring=score)
search = gridsearch.fit(x_train,y_train)
print("Grid Search Best parameters ", search.best_params_)
print("Grid Search Best score ", search.best_score_)
with testing_timer:
y_prediction = gridsearch.predict(x_test)
print("Accuracy score %s" %accuracy_score(y_test,y_prediction))
print("precision score %s" %precision_score(y_test,y_prediction))
print("recall score %s" %recall_score(y_test,y_prediction))
print("auc score %s" %roc_auc_score(y_test,y_prediction))
print("F1 score %s" %f1_score(y_test,y_prediction))
print("Gini score %s" %gini_score(y_test,y_prediction))
print("Ks score %s" %ks_score(y_prediction))
print("Classification report \n %s" %(classification_report(y_test, y_prediction)))
with learning_curve_timer:
plotLearningCurve(_x_train, _y_train, search.best_estimator_)
with predict_proba_timer:
if hasattr(gridsearch.best_estimator_, 'predict_proba'):
y_probability = gridsearch.predict_proba(x_test)
false_positive_rate, true_positive_rate, thresholds =
roc_curve(y_test, y_probability[:,1])
response['roc_auc_score'] = roc_auc_score(y_test,
y_probability[:,1])
response['roc_curve'] = (false_positive_rate,
true_positive_rate)
else: ## eg SVM, Perceptron doesnt have predict_proba method
response['roc_auc_score'] = 0
response['roc_curve'] = None
response['learning_curve_time'] = learning_curve_timer.took
response['testing_time'] = testing_timer.took
response['_y_prediction'] = y_prediction
response['accuracy_score'] = accuracy_score(y_test,y_prediction)
response['precision_score'] = precision_score(y_test,y_prediction)
response['recall_score'] = recall_score(y_test,y_prediction)
response['auc_score'] = roc_auc_score(y_test,y_prediction)
response['training_time'] = training_timer.took
response['f1_score'] = f1_score(y_test, y_prediction)
response['gini_score'] = gini_score(y_test,y_prediction)
response['ks_score'] = ks_score(y_prediction)
return response
我已经尝试过了:
def ks_score(y_prediction):
Good = y_prediction.value(0)
Bad = y_prediction.value(1)
fpr,tpr,thresholds= roc_curve(Good,Bad)
ks = max(tpr-fpr)
return ks`
def ks_score(y_prediction):
ks=ks_2samp(y_prediction['y_prediction.default.payment.next.month'==0],
y_prediction['y_prediction.default.payment.next.month']==1)
return ks
我尝试过的所有方法似乎都无效,我只想计算KS
在这种情况下,每种算法的取值。