Question

我目前正在研究一个读取结构化数据并确定某人是否患有疾病的模型。我认为问题在于培训和测试数据之间没有拆分数据。我不知道我将如何做到这一点。

我不确定该怎么做。

import pandas as pd
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier



heart_data = pd.read_csv('cardio_train.csv')

heart_data.head()

heart_data.shape

heart_data.describe()

heart_data.isnull().sum()

heart_data_columns = heart_data.columns

predictors = heart_data[heart_data_columns[heart_data_columns != 'target']]  # all columns except Breast Cancer
target = heart_data['target']  # Breast Cancer column


#This function returns the first n rows for the object based on position. It is useful for quickly testing if your object has the right type
predictors.head()
target.head()

#normalize the data by subtracting the mean and dividing by the standard deviation.

predictors_norm = (predictors - predictors.mean()) / predictors.std()
predictors_norm.head()



n_cols = predictors_norm.shape[1]  # number of predictors


def regression_model():
    # create model
    model = Sequential()
    #inputs
    model.add(Dense(50, activation='relu', input_shape=(n_cols,)))
    model.add(Dense(50, activation='relu')) # activation function
    model.add(Dense(1))

    # compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    #loss measures the results and figures out how bad it did. Optimizer generates next guess.
    return model


# build the model
model = regression_model()
print (model)
# fit the model
history=model.fit(predictors_norm, target, validation_split=0.3, epochs=10, verbose=2)


#Decision Tree
print ("Processing Decision Tree")
dtc = DecisionTreeClassifier()
dtc.fit(predictors_norm,target)
print("Decision Tree Test Accuracy {:.2f}%".format(dtc.score(predictors_norm, target)*100))


#Support Vector Machine
print ("Processing Support Vector Machine")
svm = SVC(random_state = 1)
svm.fit(predictors_norm, target)
print("Test Accuracy of SVM Algorithm: {:.2f}%".format(svm.score(predictors_norm,target)*100))

#Random Forest
print ("Processing Random Forest")
rf = RandomForestClassifier(n_estimators = 1000, random_state = 1)
rf.fit(predictors_norm, target)
print("Random Forest Algorithm Accuracy Score : {:.2f}%".format(rf.score(predictors_norm,target)*100))

我收到的消息是决策树测试准确度100.00％但是，支持向量机获得了73.37％

我不确定为什么决策树和随机森林显示100％的准确性？

0 个答案: