我正在尝试根据Andrew Ng的演讲笔记实现稀疏自动编码器,如here所示。 它要求通过引入惩罚项(K-L发散)在自动编码器层上应用稀疏性约束。在经过一些小的更改后,我尝试使用提供的方向here来实现此功能。 这是由SparseActivityRegularizer类实现的K-L分歧和稀疏性惩罚项,如下所示。
def kl_divergence(p, p_hat):
return (p * K.log(p / p_hat)) + ((1-p) * K.log((1-p) / (1-p_hat)))
class SparseActivityRegularizer(Regularizer):
sparsityBeta = None
def __init__(self, l1=0., l2=0., p=-0.9, sparsityBeta=0.1):
self.p = p
self.sparsityBeta = sparsityBeta
def set_layer(self, layer):
self.layer = layer
def __call__(self, loss):
#p_hat needs to be the average activation of the units in the hidden layer.
p_hat = T.sum(T.mean(self.layer.get_output(True) , axis=0))
loss += self.sparsityBeta * kl_divergence(self.p, p_hat)
return loss
def get_config(self):
return {"name": self.__class__.__name__,
"p": self.l1}
模型就是这样构建的
X_train = np.load('X_train.npy')
X_test = np.load('X_test.npy')
autoencoder = Sequential()
encoder = containers.Sequential([Dense(250, input_dim=576, init='glorot_uniform', activation='tanh',
activity_regularizer=SparseActivityRegularizer(p=-0.9, sparsityBeta=0.1))])
decoder = containers.Sequential([Dense(576, input_dim=250)])
autoencoder.add(AutoEncoder(encoder=encoder, decoder=decoder, output_reconstruction=True))
autoencoder.layers[0].build()
autoencoder.compile(loss='mse', optimizer=SGD(lr=0.001, momentum=0.9, nesterov=True))
loss = autoencoder.fit(X_train_tmp, X_train_tmp, nb_epoch=200, batch_size=800, verbose=True, show_accuracy=True, validation_split = 0.3)
autoencoder.save_weights('SparseAutoEncoder.h5',overwrite = True)
result = autoencoder.predict(X_test)
当我调用fit()函数时,我得到负损耗值,输出根本不像输入。我想知道我哪里出错了。计算图层平均激活率并使用此自定义稀疏度正则化程序的正确方法是什么?任何形式的帮助将不胜感激。谢谢!
我正在使用Keras 0.3.1和Python 2.7,因为最新的Keras(1.0.1)版本没有自动编码器层。
答案 0 :(得分:0)
您已经定义了self.p = -0.9而不是原始海报和您所引用的讲义所使用的0.05值。
答案 1 :(得分:0)
我纠正了一些错误:
class SparseRegularizer(keras.regularizers.Regularizer):
def __init__(self, rho = 0.01,beta = 1):
"""
rho : Desired average activation of the hidden units
beta : Weight of sparsity penalty term
"""
self.rho = rho
self.beta = beta
def __call__(self, activation):
rho = self.rho
beta = self.beta
# sigmoid because we need the probability distributions
activation = tf.nn.sigmoid(activation)
# average over the batch samples
rho_bar = K.mean(activation, axis=0)
# Avoid division by 0
rho_bar = K.maximum(rho_bar,1e-10)
KLs = rho*K.log(rho/rho_bar) + (1-rho)*K.log((1-rho)/(1-rho_bar))
return beta * K.sum(KLs) # sum over the layer units
def get_config(self):
return {
'rho': self.rho,
'beta': self.beta
}