I am trying to implement an AdaBoost classifier on top of sklearn classifiers. In the first step of the algorithm I am supposed to initialize the weights to 1 / (number of training samples), but doing that lowers the classifier's accuracy and I don't understand why (I give every data point the same weight).
My code:
import numpy as np
from sklearn.svm import SVC

svm_weight = SVC()
svm_non_weight = SVC()

# uniform weights: each point gets 1 / number of training samples
w = np.ones(len(target_train))
w.fill(float(1) / float(len(target_train)))

svm_weight.fit(data_train_feature_scaled_pca, target_train, sample_weight=w)
svm_non_weight.fit(data_train_feature_scaled_pca, target_train)

print "score weight : ", svm_weight.score(data_test_feature_scaled_pca, target_test)
print "score non weight : ", svm_non_weight.score(data_test_feature_scaled_pca, target_test)
Output:
score weight : 0.503592561285
score non weight : 0.729289940828
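To make this easier to reproduce, here is a self-contained version of the same comparison on synthetic data (make_classification is only a stand-in for my real scaled/PCA-transformed features, so the exact numbers will differ):

# self-contained reproduction of the comparison above
# (synthetic data instead of my real dataset)
import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
X_train, X_test = X[:700], X[700:]
y_train, y_test = y[:700], y[700:]

# uniform weights: each point gets 1 / number of training samples
w = np.ones(len(y_train)) / len(y_train)

svm_weight = SVC().fit(X_train, y_train, sample_weight=w)
svm_non_weight = SVC().fit(X_train, y_train)

print "score weight : ", svm_weight.score(X_test, y_test)
print "score non weight : ", svm_non_weight.score(X_test, y_test)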
My AdaBoost implementation:
from sklearn.base import clone


class adaboost_classifier:
    def __init__(self, train, target, classifier, n_estimator):
        # prepare the dataset
        self.N_classes = np.unique(target)
        self.n_estimator = n_estimator
        self.N_data = len(train)
        # one fresh copy of the base classifier per boosting round:
        # [estimator, weighted error, coefficient (alpha), used flag]
        self.trained_classifier = [[clone(classifier), float(0), float(0), True]
                                   for i in range(n_estimator)]
        train = np.array(train)
        target = np.array(target)
        # join train and target for boosting
        dataset = np.concatenate((train, target.reshape(-1, 1)), axis=1)
        # column indices of the feature part of the joined dataset
        indice = [i for i in range(len(dataset[0]) - 1)]
        # init 1/n value for the weights
        self.weights = np.zeros([n_estimator, self.N_data])
        self.weights.fill(1 / float(self.N_data))
        # take sampling (currently the whole dataset)
        new_dataset = dataset
        self.N_data = len(new_dataset)
        # start training the sub-classifiers
        for i in range(n_estimator):
            self.loss = np.zeros(self.N_data)
            # separate training and target data
            new_train = new_dataset[:, indice]
            new_target = new_dataset[:, len(dataset[0]) - 1]
            # train classifier i: learn f(X) with the data weights
            self.trained_classifier[i][0].fit(new_train, new_target,
                                              sample_weight=self.weights[i])
            # compute the weighted error, which is stored in trained_classifier[i][1]
            for point in range(self.N_data):
                if self.trained_classifier[i][0].predict([new_train[point]])[0] != new_target[point]:
                    self.loss[point] = 1
                    self.trained_classifier[i][1] += self.weights[i][point]
            # compute the coefficient of classifier i, which is stored in trained_classifier[i][2]
            self.trained_classifier[i][2] = 0.5 * np.log(
                (1 - self.trained_classifier[i][1]) / self.trained_classifier[i][1])
            # recompute the weights: misclassified points get larger weights
            for j in range(self.N_data):
                if self.loss[j] == 1:
                    self.weights[i][j] *= np.exp(self.trained_classifier[i][2])
                else:
                    self.weights[i][j] *= np.exp(-self.trained_classifier[i][2])
            # normalize the updated weights and carry them over to the next round
            self.weights[i] /= self.weights[i].sum()
            if i + 1 < n_estimator:
                self.weights[i + 1] = self.weights[i]
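For completeness, this is roughly how I plan to combine the trained estimators at prediction time. This helper is not part of the class above yet; it is only a sketch and assumes a binary problem, with the two classes mapped to -1/+1 for the weighted vote:

# sketch of the weighted majority vote (not in the class above yet);
# assumes exactly two classes, mapped to -1/+1 for the vote
def adaboost_predict(model, test):
    test = np.array(test)
    agg = np.zeros(len(test))
    for clf, error, alpha, used in model.trained_classifier:
        pred = np.where(clf.predict(test) == model.N_classes[1], 1.0, -1.0)
        agg += alpha * pred  # alpha = 0.5 * log((1 - error) / error)
    return np.where(agg >= 0, model.N_classes[1], model.N_classes[0])

# hypothetical usage with the data from the first snippet:
# model = adaboost_classifier(data_train_feature_scaled_pca, target_train, SVC(), 10)
# predictions = adaboost_predict(model, data_test_feature_scaled_pca)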