How to apply the KS test in Python

Time: 2019-09-15 20:17:27

Tags: python credit-card-track-data

I am unable to apply the KS test to each of the algorithms I run on my dataset. In this case I am trying to use classification algorithms to predict the binary class default.payment.next.month. I have read many examples of how to compute the KS value in Python, but none of them work in my case. I have already evaluated the algorithms with several other metrics, but I cannot compute the KS value for the credit card dataset.
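
For context, the two-sample KS test itself is provided by SciPy as scipy.stats.ks_2samp: it compares the empirical distributions of two samples and returns the KS statistic together with a p-value. A minimal, self-contained illustration on synthetic data (not taken from the credit card dataset):

import numpy as np
from scipy.stats import ks_2samp

# Two synthetic samples drawn from slightly different normal distributions.
np.random.seed(0)
sample_a = np.random.normal(loc=0.0, scale=1.0, size=1000)
sample_b = np.random.normal(loc=0.5, scale=1.0, size=1000)

statistic, p_value = ks_2samp(sample_a, sample_b)
print("KS statistic %.3f, p-value %.3g" % (statistic, p_value))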

Code:

# Imports implied by the snippet below (not shown in the original post).
# CodeTimer is an external timing helper (e.g. from the linetimer package);
# gini_score and ks_score are user-defined helpers (the ks_score attempts are shown further down).
import pandas as pd
import matplotlib.pyplot as plot
from numpy import bincount, linspace, mean, std
from scipy.stats import ks_2samp
from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score, roc_auc_score,
                             f1_score, classification_report, roc_curve)

credit_data = pd.read_csv('../Desktop/Data path/Credit_data.csv')

_y_target = credit_data['default.payment.next.month'].values
columns = credit_data.columns.tolist()
columns.remove('default.payment.next.month')
_x_attributes = credit_data[columns].values


_x_train,_x_test,_y_train, _y_test = train_test_split(_x_attributes,
                                                      _y_target, test_size =0.30, stratify = _y_target, random_state = 1)

print("label counts in y train %s" %bincount(_y_train))
print("label counts in y test %s" %bincount(_y_test))

def plotLearningCurve(_x_train, _y_train, learning_model_pipeline,
                      k_fold = 10, training_sample_sizes = linspace(0.1,1.0,10), 
                      jobsInParallel = -1):

    training_size, training_score, testing_score =  learning_curve(estimator = learning_model_pipeline,
                                                                   X = _x_train,
                                                                   y = _y_train,
                                                                   train_sizes = training_sample_sizes,
                                                                   cv = k_fold,
                                                                   n_jobs = jobsInParallel)


    training_mean = mean(training_score, axis = 1)
    training_std_deviation = std(training_score, axis = 1)
    testing_std_deviation = std(testing_score, axis = 1)
    testing_mean = mean(testing_score, axis = 1 )

    plot.plot(training_size, training_mean, label= "Training Data", 
              marker= '+', color = 'blue', markersize = 8)
    plot.fill_between(training_size, training_mean+ 
                      training_std_deviation, training_mean-training_std_deviation, 
                      color='blue', alpha =0.12 )

    plot.plot(training_size, testing_mean, label= "Testing/Validation Data", marker= '*', color = 'green', markersize = 8)
    plot.fill_between(training_size, testing_mean +
                      testing_std_deviation, testing_mean - testing_std_deviation,
                      color='green', alpha =0.14)

    plot.title("Scoring of our training and testing data vs sample sizes")
    plot.xlabel("Number of Samples")
    plot.ylabel("Accuracy")
    plot.legend(loc= 'best')
    plot.show()

def runGridSearchAndPredict(pipeline, x_train, y_train, x_test, 
                            y_test, param_grid, n_jobs = -1, cv = 10, score = 'accuracy'):

    response = {}
    training_timer = CodeTimer('training')
    testing_timer = CodeTimer('testing')
    learning_curve_timer = CodeTimer('learning_curve')
    predict_proba_timer  = CodeTimer('predict_proba')

    with training_timer:
        gridsearch = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=n_jobs, scoring=score)
        search = gridsearch.fit(x_train,y_train)
        print("Grid Search Best parameters ", search.best_params_)
        print("Grid Search Best score ", search.best_score_)

    with testing_timer:
        y_prediction = gridsearch.predict(x_test)

        print("Accuracy score %s" %accuracy_score(y_test,y_prediction))
        print("precision score %s" %precision_score(y_test,y_prediction))
        print("recall score %s" %recall_score(y_test,y_prediction))
        print("auc score %s" %roc_auc_score(y_test,y_prediction))
        print("F1 score %s" %f1_score(y_test,y_prediction))
        print("Gini score %s" %gini_score(y_test,y_prediction))
        print("Ks score %s" %ks_score(y_prediction))
        print("Classification report  \n %s" %(classification_report(y_test,  y_prediction)))

    with learning_curve_timer:
        plotLearningCurve(x_train, y_train, search.best_estimator_)

    with predict_proba_timer:
        if hasattr(gridsearch.best_estimator_, 'predict_proba'):

            y_probability = gridsearch.predict_proba(x_test)
            false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_probability[:,1])
            response['roc_auc_score'] = roc_auc_score(y_test, y_probability[:,1])
            response['roc_curve'] = (false_positive_rate, true_positive_rate)

        else:  ## e.g. SVM, Perceptron don't have a predict_proba method

            response['roc_auc_score'] = 0
            response['roc_curve'] = None

    response['learning_curve_time'] = learning_curve_timer.took
    response['testing_time'] = testing_timer.took
    response['_y_prediction'] = y_prediction
    response['accuracy_score'] = accuracy_score(y_test, y_prediction)
    response['precision_score'] = precision_score(y_test, y_prediction)
    response['recall_score'] = recall_score(y_test, y_prediction)
    response['auc_score'] = roc_auc_score(y_test, y_prediction)
    response['training_time'] = training_timer.took
    response['f1_score'] = f1_score(y_test, y_prediction)
    response['gini_score'] = gini_score(y_test, y_prediction)
    response['ks_score'] = ks_score(y_prediction)

    return response
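
For reference, gini_score and ks_score are not defined anywhere in this snippet, and both are called with hard 0/1 predictions. A minimal sketch of how the two scores are commonly derived from the ROC curve, assuming the helpers receive the true labels and a positive-class score such as y_probability[:,1] rather than hard class predictions:

# Minimal sketch with assumed signatures, not taken from the original post:
# both helpers take the true labels and a continuous score for the positive class.
from sklearn.metrics import roc_curve, roc_auc_score

def ks_score(y_true, y_score):
    # KS statistic = largest vertical gap between the class-wise cumulative
    # score distributions, i.e. max(TPR - FPR) along the ROC curve.
    fpr, tpr, _ = roc_curve(y_true, y_score)
    return (tpr - fpr).max()

def gini_score(y_true, y_score):
    # The Gini coefficient is a linear rescaling of the AUC: Gini = 2 * AUC - 1.
    return 2 * roc_auc_score(y_true, y_score) - 1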

What I have tried:

def ks_score(y_prediction):
    Good = y_prediction.value(0)
    Bad =  y_prediction.value(1)
    fpr,tpr,thresholds= roc_curve(Good,Bad)
    ks = max(tpr-fpr)
    return ks


def ks_score(y_prediction):
    ks=ks_2samp(y_prediction['y_prediction.default.payment.next.month'==0],
                y_prediction['y_prediction.default.payment.next.month']==1)
    return ks  

None of the approaches I have tried seem to work; I just want to compute the KS value for each algorithm in this case.
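
Both snippets treat y_prediction as if it still carried the raw data, but it is a plain NumPy array of hard 0/1 labels returned by predict, so the .value(...) call and the string-keyed indexing both fail. The KS statistic for a classifier is normally computed on the predicted probabilities, split by the true class. A minimal sketch of that idea, assuming a hypothetical fitted classifier named model that exposes predict_proba (for example the best estimator returned by the grid search) together with the _x_test/_y_test split from above:

from scipy.stats import ks_2samp

# Assumption: model is a fitted classifier with predict_proba,
# e.g. the best estimator found by GridSearchCV above.
proba = model.predict_proba(_x_test)[:, 1]   # probability of the positive class (default)
scores_good = proba[_y_test == 0]            # scores of the non-default class
scores_bad = proba[_y_test == 1]             # scores of the default class
ks_statistic, p_value = ks_2samp(scores_good, scores_bad)
print("KS statistic %s, p-value %s" % (ks_statistic, p_value))

The same value can equivalently be read off the ROC curve as max(TPR - FPR), as in the ks_score sketch above.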

0 Answers:

There are no answers yet.