I am trying to learn K-fold cross-validation using Kaggle's Titanic problem.
In the code below I use two decision tree models.
1) clf_tuned = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
max_features=None, max_leaf_nodes=None, min_samples_leaf=10,
min_samples_split=10, min_weight_fraction_leaf=0.0,
presort=False, random_state=100, splitter='random')
2) clf_gini = DecisionTreeClassifier(criterion = "entropy", random_state = 100,
max_depth=3, min_samples_leaf=5)
model=clf_gini
results = cross_val_score(model, X_train, Y_train, cv=kfold)
print(results.mean()*100.0)
print(results.std()*100.0)
The first model's K-fold mean score is 81.14 with a standard deviation of 0.82, but its public leaderboard score is 0.75598; the second model's K-fold mean score is 80.92 with a standard deviation of 1.56, but its public leaderboard score is 0.80861.
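To put both estimates on the same 0-1 scale, here is a quick arithmetic check (it only uses the numbers above; it is not part of my pipeline):

# Model 1: CV mean 0.8114 +/- 0.0082 vs leaderboard 0.75598
print(0.8114 - 0.0082, 0.75598)   # 0.8032 vs 0.75598 -> leaderboard below the CV range
# Model 2: CV mean 0.8092 +/- 0.0156 vs leaderboard 0.80861
print(0.8092 - 0.0156, 0.80861)   # 0.7936 vs 0.80861 -> leaderboard inside the CV range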
Which model should I trust?
My code is below:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn import preprocessing
from sklearn.tree import export_graphviz
from sklearn.model_selection import KFold,cross_val_score
%matplotlib inline
train = pd.read_csv("C:\\Users\\user\\Titanic\\train.csv")
test = pd.read_csv("C:\\Users\\user\\Titanic\\test.csv")
# Store the target column (Survived) as Y_train
Y_train = train["Survived"]
train.drop(["Survived"], axis=1, inplace=True)
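# Remember where train ends, then combine train and test so feature engineering is applied to both consistently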
num_train = len(train)
all_data = pd.concat([train, test])
# Populating null fare value with median of train set
all_data["Fare"]=all_data["Fare"].fillna(train["Fare"].median())
# Populating null age value with median of train set
#all_data["Age"]=all_data["Age"].fillna(train["Age"].median())
# Populating missing embarked with most frequent value - S
all_data["Embarked"]=all_data["Embarked"].fillna("S")
# Creating new feature as Title
all_data['Title'] = all_data.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
# Converting sex into binary
sex_mapping = {"male": 0, "female": 1}
all_data['Sex'] = all_data['Sex'].map(sex_mapping)
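# Impute missing ages with the mean age per (Title, Pclass, Sex) group, falling back to 28 where a group has no mean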
guess_age=all_data.groupby(['Title','Pclass','Sex'])['Age'].agg(['mean','count']).reset_index()
guess_age.columns= ['Title','Pclass','Sex','ga_mean','ga_cnt']
guess_age["ga_mean"]=guess_age["ga_mean"].fillna(28)
guess_age["ga_mean"]=guess_age["ga_mean"].astype(int)
all_data=all_data.merge(guess_age, how='left')
all_data.loc[(all_data.Age.isnull()),"Age"]=all_data[(all_data.Age.isnull())].ga_mean
# Drop a few columns that cause over-fitting
all_data.drop(["Cabin","Name","Ticket","PassengerId","ga_mean","ga_cnt"], axis=1, inplace=True)
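# One-hot encode the remaining categorical columns (Embarked and Title)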
all_data = pd.get_dummies(all_data)
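# Split the combined frame back into the original train and test rows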
X_train = all_data[:num_train]
X_test = all_data[num_train:]
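# Model 1: tuned tree (note: the presort argument was removed in scikit-learn 0.24, so drop it on newer versions)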
clf_tuned = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
max_features=None, max_leaf_nodes=None, min_samples_leaf=10,
min_samples_split=10, min_weight_fraction_leaf=0.0,
presort=False, random_state=100, splitter='random')
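# 3-fold cross-validation of model 1 on the training set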
kfold = KFold(n_splits=3)
model = clf_tuned
results = cross_val_score(model, X_train, Y_train, cv=kfold)
print(results.mean()*100.0)
print(results.std()*100.0)
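# Model 2: shallower tree; despite the clf_gini name it uses the entropy criterion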
clf_gini = DecisionTreeClassifier(criterion = "entropy", random_state = 100,
max_depth=3, min_samples_leaf=5)
model=clf_gini
results = cross_val_score(model, X_train, Y_train, cv=kfold)
print(results.mean()*100.0)
print(results.std()*100.0)
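One thing I am also considering (but have not used for the results above) is a stratified, shuffled K-fold instead of the plain KFold(n_splits=3), roughly like this:

from sklearn.model_selection import StratifiedKFold

# Stratified, shuffled 5-fold CV: every fold keeps the Survived class balance
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
for name, clf in [("tuned", clf_tuned), ("entropy", clf_gini)]:
    scores = cross_val_score(clf, X_train, Y_train, cv=skfold)
    print(name, scores.mean()*100.0, scores.std()*100.0)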