我目前正在研究一个读取结构化数据并确定某人是否患有疾病的模型。我认为问题在于培训和测试数据之间没有拆分数据。我不知道我将如何做到这一点。
我不确定该怎么做。
import pandas as pd
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
heart_data = pd.read_csv('cardio_train.csv')
heart_data.head()
heart_data.shape
heart_data.describe()
heart_data.isnull().sum()
heart_data_columns = heart_data.columns
predictors = heart_data[heart_data_columns[heart_data_columns != 'target']] # all columns except Breast Cancer
target = heart_data['target'] # Breast Cancer column
#This function returns the first n rows for the object based on position. It is useful for quickly testing if your object has the right type
predictors.head()
target.head()
#normalize the data by subtracting the mean and dividing by the standard deviation.
predictors_norm = (predictors - predictors.mean()) / predictors.std()
predictors_norm.head()
n_cols = predictors_norm.shape[1] # number of predictors
def regression_model():
# create model
model = Sequential()
#inputs
model.add(Dense(50, activation='relu', input_shape=(n_cols,)))
model.add(Dense(50, activation='relu')) # activation function
model.add(Dense(1))
# compile model
model.compile(optimizer='adam', loss='mean_squared_error')
#loss measures the results and figures out how bad it did. Optimizer generates next guess.
return model
# build the model
model = regression_model()
print (model)
# fit the model
history=model.fit(predictors_norm, target, validation_split=0.3, epochs=10, verbose=2)
#Decision Tree
print ("Processing Decision Tree")
dtc = DecisionTreeClassifier()
dtc.fit(predictors_norm,target)
print("Decision Tree Test Accuracy {:.2f}%".format(dtc.score(predictors_norm, target)*100))
#Support Vector Machine
print ("Processing Support Vector Machine")
svm = SVC(random_state = 1)
svm.fit(predictors_norm, target)
print("Test Accuracy of SVM Algorithm: {:.2f}%".format(svm.score(predictors_norm,target)*100))
#Random Forest
print ("Processing Random Forest")
rf = RandomForestClassifier(n_estimators = 1000, random_state = 1)
rf.fit(predictors_norm, target)
print("Random Forest Algorithm Accuracy Score : {:.2f}%".format(rf.score(predictors_norm,target)*100))
我收到的消息是 决策树测试准确度100.00% 但是,支持向量机获得了73.37%