I am working with a small dataset and using the code below. Even with a voting classifier tuned by grid search, the accuracy stays close to 30%. Here is the link to the dataset: https://drive.google.com/open?id=1rVdhrhrXZtGvAyUrGuUheRRpMtLQJxSu
I need an accuracy of at least around 60%, and I am completely stuck. I have to use a voting classifier with the best weights. I have all the code below but cannot see where I am going wrong.
Could you suggest what preprocessing steps I could add, or point out whether something in my code is broken?
# Imports
import warnings
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.size'] = 12
def fxn():
    warnings.warn("deprecated", DeprecationWarning)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()
import seaborn as sns
from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint
from sklearn import preprocessing
# Machine learning
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
df = pd.read_csv("Employees.csv")
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df.company = le.fit_transform(df.company)
df.age = le.fit_transform(df.age)
df.sex = le.fit_transform(df.sex)
df.qualification = le.fit_transform(df.qualification)
df.experience = le.fit_transform(df.experience)
df.customers = le.fit_transform(df.customers)
df.interesting = le.fit_transform(df.interesting)
df.sources = le.fit_transform(df.sources)
df.usage = le.fit_transform(df.usage)
df.devices = le.fit_transform(df.devices)
print('Splitting data into training and testing')
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 0:15], df.iloc[:, 15], test_size=0.3, random_state=10)
from sklearn.model_selection import GridSearchCV, cross_val_score
X = np.array(df.iloc[:, 0:15])
y = np.array(df.iloc[:, 15]).astype(int)
clf1 = LogisticRegression()
clf2 = SVC(kernel = 'rbf', C = 1000, gamma = 0.001, probability=True)
clf3 = DecisionTreeClassifier()
clf4 = RandomForestClassifier(random_state=0, n_estimators=300, bootstrap=False, min_samples_leaf=7,
                              min_samples_split=7, max_features=10, max_depth=None, criterion='gini')
clf5 = GradientBoostingClassifier(random_state=1, n_estimators=100, learning_rate=0.1,
                                  min_samples_leaf=20, min_samples_split=3, max_features=6, max_depth=16)
clf6 = AdaBoostClassifier(n_estimators=100, algorithm='SAMME', base_estimator = RidgeClassifierCV(), learning_rate = 0.5)
clf7 = BaggingClassifier(n_estimators=100, base_estimator = KNeighborsClassifier(n_neighbors = 5), max_features = 6)
clf_df = pd.DataFrame(columns=['w1', 'w2', 'w3', 'w4', 'w5', 'w7', 'mean', 'std'])
i = 0
for w1 in range(1, 4):
    for w2 in range(1, 4):
        for w3 in range(1, 4):
            for w4 in range(1, 4):
                for w5 in range(1, 4):
                    for w7 in range(1, 4):
                        if len(set((w1, w2, w3, w4, w5, w7))) == 1:  # skip if all weights are equal
                            continue
                        eclf = VotingClassifier(estimators=[('lr', clf1), ('svc', clf2), ('dt', clf3),
                                                            ('rf', clf4), ('gb', clf5), ('bagg', clf7)],
                                                voting='soft', weights=[w1, w2, w3, w4, w5, w7])
                        scores = cross_val_score(estimator=eclf, X=X, y=y, cv=3,
                                                 scoring='accuracy', n_jobs=1)
                        clf_df.loc[i] = [w1, w2, w3, w4, w5, w7, scores.mean(), scores.std()]
                        i += 1
clf_df
w1 w2 w3 w4 w5 w7 mean std
0 1.0 1.0 1.0 1.0 1.0 2.0 0.320952 0.061594
1 1.0 1.0 1.0 1.0 1.0 3.0 0.304339 0.048603
2 1.0 1.0 1.0 1.0 2.0 1.0 0.298244 0.064792
3 1.0 1.0 1.0 1.0 2.0 2.0 0.298244 0.064792
4 1.0 1.0 1.0 1.0 2.0 3.0 0.288272 0.050571
5 1.0 1.0 1.0 1.0 3.0 1.0 0.307880 0.052788
6 1.0 1.0 1.0 1.0 3.0 2.0 0.287831 0.037529
7 1.0 1.0 1.0 1.0 3.0 3.0 0.309547 0.044869
8 1.0 1.0 1.0 2.0 1.0 1.0 0.312202 0.050178
9 1.0 1.0 1.0 2.0 1.0 2.0 0.318735 0.083757
10 1.0 1.0 1.0 2.0 1.0 3.0 0.313089 0.045554
11 1.0 1.0 1.0 2.0 2.0 1.0 0.323947 0.044287
12 1.0 1.0 1.0 2.0 2.0 2.0 0.305666 0.050826
13 1.0 1.0 1.0 2.0 2.0 3.0 0.339127 0.048330
14 1.0 1.0 1.0 2.0 3.0 1.0 0.288714 0.063720
15 1.0 1.0 1.0 2.0 3.0 2.0 0.312644 0.063469
16 1.0 1.0 1.0 2.0 3.0 3.0 0.311316 0.058367
17 1.0 1.0 1.0 3.0 1.0 1.0 0.330479 0.077330
18 1.0 1.0 1.0 3.0 1.0 2.0 0.340896 0.064767
19 1.0 1.0 1.0 3.0 1.0 3.0 0.311316 0.058367
20 1.0 1.0 1.0 3.0 2.0 1.0 0.305225 0.037695
21 1.0 1.0 1.0 3.0 2.0 2.0 0.338685 0.036169
22 1.0 1.0 1.0 3.0 2.0 3.0 0.340454 0.051494
23 1.0 1.0 1.0 3.0 3.0 1.0 0.318297 0.038370
24 1.0 1.0 1.0 3.0 3.0 2.0 0.300458 0.056722
25 1.0 1.0 1.0 3.0 3.0 3.0 0.319180 0.064249
26 1.0 1.0 2.0 1.0 1.0 1.0 0.230885 0.021098
27 1.0 1.0 2.0 1.0 1.0 2.0 0.209067 0.038850
28 1.0 1.0 2.0 1.0 1.0 3.0 0.242626 0.048283
29 1.0 1.0 2.0 1.0 2.0 1.0 0.283509 0.036564
... ... ... ... ... ... ... ... ...
696 3.0 3.0 2.0 3.0 2.0 3.0 0.313089 0.045554
697 3.0 3.0 2.0 3.0 3.0 1.0 0.320949 0.086164
698 3.0 3.0 2.0 3.0 3.0 2.0 0.327485 0.089039
699 3.0 3.0 2.0 3.0 3.0 3.0 0.320507 0.073512
700 3.0 3.0 3.0 1.0 1.0 1.0 0.249162 0.048175
701 3.0 3.0 3.0 1.0 1.0 2.0 0.255600 0.033159
702 3.0 3.0 3.0 1.0 1.0 3.0 0.244844 0.037530
703 3.0 3.0 3.0 1.0 2.0 1.0 0.293926 0.023029
704 3.0 3.0 3.0 1.0 2.0 2.0 0.271768 0.018016
705 3.0 3.0 3.0 1.0 2.0 3.0 0.283509 0.036564
706 3.0 3.0 3.0 1.0 3.0 1.0 0.271323 0.028955
707 3.0 3.0 3.0 1.0 3.0 2.0 0.267001 0.034206
708 3.0 3.0 3.0 1.0 3.0 3.0 0.289603 0.028225
709 3.0 3.0 3.0 2.0 1.0 1.0 0.255257 0.037280
710 3.0 3.0 3.0 2.0 1.0 2.0 0.243513 0.041866
711 3.0 3.0 3.0 2.0 1.0 3.0 0.276192 0.068067
712 3.0 3.0 3.0 2.0 2.0 1.0 0.318297 0.038370
713 3.0 3.0 3.0 2.0 2.0 2.0 0.271765 0.042210
714 3.0 3.0 3.0 2.0 2.0 3.0 0.269106 0.073391
715 3.0 3.0 3.0 2.0 3.0 1.0 0.318738 0.051217
716 3.0 3.0 3.0 2.0 3.0 2.0 0.288272 0.050571
717 3.0 3.0 3.0 2.0 3.0 3.0 0.311320 0.023631
718 3.0 3.0 3.0 3.0 1.0 1.0 0.226563 0.028543
719 3.0 3.0 3.0 3.0 1.0 2.0 0.277418 0.019540
720 3.0 3.0 3.0 3.0 1.0 3.0 0.224791 0.034766
721 3.0 3.0 3.0 3.0 2.0 1.0 0.275204 0.021578
722 3.0 3.0 3.0 3.0 2.0 2.0 0.285278 0.058999
723 3.0 3.0 3.0 3.0 2.0 3.0 0.271765 0.042210
724 3.0 3.0 3.0 3.0 3.0 1.0 0.273534 0.063532
725 3.0 3.0 3.0 3.0 3.0 2.0 0.294363 0.071240
Thanks in advance.
Answer 0 (score 1):
There are a few problems with your dataset and your approach:
1. You have very few observations and many classes. If you call y_train.value_counts(), you will see:
16 22
18 17
15 17
17 13
12 11
13 8
19 7
10 7
14 6
11 6
8 5
9 1
4 1
That is, the most frequent class has only 22 observations, and the rarest has just one. Overall you have 13 classes for 121 observations, which is far too few observations per class. Upsampling techniques will not help here, because they do not add information; they only reweight what is already there.
Some possible suggestions:
- collect more data if you can, or merge the rarest classes into larger groups;
- if the target is ordinal, use a model that respects that ordering (there is an ordered logit model in statsmodels); see the sketch just after this list.
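A minimal sketch of that ordinal approach, assuming the target in column 15 really is ordered and that statsmodels >= 0.13 is installed (OrderedModel lives in statsmodels.miscmodels):

import numpy as np
from statsmodels.miscmodels.ordinal_model import OrderedModel

# Assumption: df is the already-encoded frame from the question
X_ord = df.iloc[:, 0:15].astype(float)
y_ord = df.iloc[:, 15].astype(int)  # numeric endog: sorted unique values become the ordered categories

ord_model = OrderedModel(y_ord, X_ord, distr='logit')  # ordered logit
ord_res = ord_model.fit(method='bfgs', disp=False)
print(ord_res.summary())

# Predicted class index = the category with the highest predicted probability
pred_probs = np.asarray(ord_res.predict(X_ord))
pred_class = pred_probs.argmax(axis=1)  # indices into the sorted class values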
2. You encode categorical variables with arbitrary numbers. This is an artifact of using LabelEncoder. For example, you encode company4 as 3, company3 as 2, company2 as 1 and company1 as 0. As a result, all your classifiers "think" that company2 lies somewhere between company1 and company3, which may well not be the case. Moreover, you encode the age bands ['10 To 14 Years', '15 To 20 Years', '5 To 9 Years', 'Less Than 5 Years', 'More Than 20 Years'] as [0, 1, 2, 3, 4], which completely ignores the true distances between the age groups.
What you can do:
- one-hot encode the truly nominal variables (such as company) instead of label-encoding them;
- encode ordinal variables such as the age bands with numbers that reflect their actual order and spacing.
A sketch of both follows this list.
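For instance (a sketch; which columns are nominal and the band midpoints below are illustrative assumptions, not taken from your data):

import pandas as pd

df = pd.read_csv("Employees.csv")

# Nominal column(s): dummies instead of arbitrary integer codes
# (assumed nominal here: company; add others as appropriate)
df = pd.get_dummies(df, columns=['company'])

# Ordinal column: map each band to an approximate midpoint (illustrative values)
age_map = {
    'Less Than 5 Years': 2.5,
    '5 To 9 Years': 7.0,
    '10 To 14 Years': 12.0,
    '15 To 20 Years': 17.5,
    'More Than 20 Years': 25.0,
}
df['age'] = df['age'].map(age_map)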
3. You feed unscaled features into distance-based algorithms. KNeighborsClassifier and SVC train and predict using Euclidean distances between observations. But Euclidean distance only truly reflects (dis)similarity if the features have scales corresponding to their importance. In your data, sex takes values in {1, 2}, while exf1 takes values in [4, 20]. In terms of distance, this means exf1 is 16 times more important than sex.
For LogisticRegression, arguably the most powerful of your 7 classifiers, feature scaling also matters, because it regularizes (shrinks) the coefficients according to their magnitude.
If all features are expected to be equally important, they need to be on the same scale, e.g., via the StandardScaler from sklearn, as sketched below.
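One possible sketch, wrapping each scale-sensitive estimator in a Pipeline so the scaler is fit only on the training folds during cross-validation (the voting weights here are placeholders, and X, y are the arrays from the question):

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Each scale-sensitive estimator gets its own scaler inside a pipeline
scaled_lr = make_pipeline(StandardScaler(), LogisticRegression())
scaled_svc = make_pipeline(StandardScaler(), SVC(kernel='rbf', C=1000, gamma=0.001, probability=True))
scaled_knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))

eclf = VotingClassifier(estimators=[('lr', scaled_lr), ('svc', scaled_svc), ('knn', scaled_knn)],
                        voting='soft', weights=[1, 1, 1])  # placeholder weights
scores = cross_val_score(eclf, X, y, cv=3, scoring='accuracy')
print(scores.mean(), scores.std())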
To sum up: instead of hunting for the best algorithm, focus on your data, your preprocessing, and what you actually want to accomplish.