不知道问题是否与我分割数据集的方式有关,或者我做错了什么,但每次运行de程序时,我都会得到不同的准确度。有人可以帮我找出问题吗?谢谢 这是我的代码:
import pandas as pd
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
# load the data
from sklearn.tree import DecisionTreeClassifier
# url = "data/lung-cancer.data"
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/lung- cancer/lung-cancer.data"
data_set = pd.read_csv(url)
def clean_data(data_set):
# replace the ? with NaN
data_set = data_set.convert_objects(convert_numeric=True)
# replace the NaN with the average of the row
data_set = data_set.fillna(data_set.mean(axis=0), axis=0)
return data_set
data_set = clean_data(data_set)
def split_data(data_set):
# split the data in two parts train(80%), test(20%)
train, test = train_test_split(data_set.values, test_size=0.2)
# first column of the data are labels
labels_test = test[:, :1]
labels_train = train[:, :1]
# the rest of the columns are features
features_test = test[:, 1:]
features_train = train[:, 1:]
return features_train, labels_train, features_test, labels_test
features_train, labels_train, features_test, labels_test = split_data(data_set)
"""
print(labels_train)
print(features_train)
print(features_test)
print(labels_test)
"""
# Modeling step Test different algorithms
random_state = 2
classifiers = [
GaussianNB(),
KNeighborsClassifier(n_neighbors=3),
KNeighborsClassifier(n_neighbors=5),
SVC(kernel="poly", C=0.4, probability=True),
DecisionTreeClassifier(random_state=3),
RandomForestClassifier(random_state=3),
AdaBoostClassifier(random_state=3),
ExtraTreesClassifier(random_state=3),
GradientBoostingClassifier(random_state=3),
MLPClassifier(random_state=random_state)
]
accuracy_res = []
algorithm_res = []
for clf in classifiers:
clf.fit(features_train, labels_train)
name = clf.__class__.__name__
train_predictions = clf.predict(features_test)
accuracy = accuracy_score(labels_test, train_predictions)
print(name, "{:.4%}".format(accuracy))
accuracy_res.append(accuracy)
algorithm_res.append(name)
print()
y_pos = np.arange(len(algorithm_res))
plt.barh(y_pos, accuracy_res, align='center', alpha=0.5)
plt.yticks(y_pos, algorithm_res)
plt.xlabel('Accuracy')
plt.title('Algorithms')
plt.show()
以下是我得到的结果: 第一个结果
GaussianNB 28.5714%
KNeighborsClassifier 57.1429%
KNeighborsClassifier 71.4286%
SVC 57.1429%
DecisionTreeClassifier 57.1429%
RandomForestClassifier 42.8571%
AdaBoostClassifier 42.8571%
ExtraTreesClassifier 42.8571%
GradientBoostingClassifier 57.1429%
MLPClassifier 57.1429%
第二个结果
GaussianNB 28.5714%
KNeighborsClassifier 42.8571%
KNeighborsClassifier 28.5714%
SVC 57.1429%
DecisionTreeClassifier 28.5714%
RandomForestClassifier 57.1429%
AdaBoostClassifier 57.1429%
ExtraTreesClassifier 42.8571%
GradientBoostingClassifier 28.5714%
MLPClassifier 57.1429%
第三个结果
GaussianNB 71.4286%
KNeighborsClassifier 71.4286%
KNeighborsClassifier 71.4286%
SVC 28.5714%
DecisionTreeClassifier 28.5714%
RandomForestClassifier 57.1429%
AdaBoostClassifier 71.4286%
ExtraTreesClassifier 57.1429%
GradientBoostingClassifier 28.5714%
MLPClassifier 28.5714%
答案 0 :(得分:1)
由于您使用的是train_test_split,它会随机分割您的数据,这会导致每次运行上述代码时的准确性不同。
我建议多次查看输出,并从多个输出中找出准确度的平均值。你可以重定向输出,让python为你做。采用模型给出最高的平均精度。
当我运行你的代码时,在使用带有n_neighbors = 5的KNeighborsClassifier时,我获得了最佳的准确性。我也做了一些修改,所以没有警告。请在下面找到更新的代码。我已经更新了评论,只要有修改供参考。
import pandas as pd
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
# load the data
from sklearn.tree import DecisionTreeClassifier
# url = "data/lung-cancer.data"
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/lung-cancer/lung-cancer.data"
data_set = pd.read_csv(url)
def clean_data(data_set):
# replace the ? with NaN
# data_set = data_set.convert_objects(convert_numeric=True)
# convert objects is deprecated
data_set = data_set.apply(pd.to_numeric, errors='coerce')
# replace the NaN with the average of the row
data_set = data_set.fillna(data_set.mean(axis=0), axis=0)
return data_set
data_set = clean_data(data_set)
def split_data(data_set):
# split the data in two parts train(80%), test(20%)
train, test = train_test_split(data_set.values, test_size=0.2)
# first column of the data are labels
labels_test = test[:, :1]
labels_train = train[:, :1]
# the rest of the columns are features
features_test = test[:, 1:]
features_train = train[:, 1:]
return features_train, labels_train, features_test, labels_test
features_train, labels_train, features_test, labels_test = split_data(data_set)
"""
print(labels_train)
print(features_train)
print(features_test)
print(labels_test)
"""
# Modeling step Test different algorithms
random_state = 2
classifiers = [
GaussianNB(),
KNeighborsClassifier(n_neighbors=3),
KNeighborsClassifier(n_neighbors=5),
SVC(kernel="poly", C=0.4, probability=True),
DecisionTreeClassifier(random_state=3),
RandomForestClassifier(random_state=3),
AdaBoostClassifier(random_state=3),
ExtraTreesClassifier(random_state=3),
GradientBoostingClassifier(random_state=3),
# MLPClassifier(random_state=random_state)
# Set hidden_layer_sizes and max_iter parameters
# so that multilayer perceptron will converge
MLPClassifier(solver='lbfgs', hidden_layer_sizes=[100], max_iter=2000, activation='logistic', random_state=random_state)
]
accuracy_res = []
algorithm_res = []
for clf in classifiers:
# clf.fit(features_train, labels_train)
# Added ravel to convert column vector to 1d array
clf.fit(features_train, labels_train.ravel())
name = clf.__class__.__name__
train_predictions = clf.predict(features_test)
accuracy = accuracy_score(labels_test, train_predictions)
print(name, "{:.4%}".format(accuracy))
accuracy_res.append(accuracy)
algorithm_res.append(name)
print()
y_pos = np.arange(len(algorithm_res))
plt.barh(y_pos, accuracy_res, align='center', alpha=0.5)
plt.yticks(y_pos, algorithm_res)
plt.xlabel('Accuracy')
plt.title('Algorithms')
plt.show()
答案 1 :(得分:0)
{{1}}
您使用了sklearn的train_test_split,它将您的数据拆分为Train_set和Test_set 随机。因此,每次重新训练模型时,数据都与其他版本的数据不相似。
http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
答案 2 :(得分:0)
更改此行
train, test = train_test_split(data_set.values, test_size=0.2)
到
train, test = train_test_split(data_set.values, test_size=0.2,random_state=0)
random_state的值不必一定为0,它可以是1或2或42。只要每次分裂发生时它的值都相同即可。然后,不同的运行将为您提供一致的结果。