我正在尝试为Keras(link)实现弹性反向传播(Rprop)优化器,其中的难点在于:需要根据每个参数对应的梯度是正、负还是零来分别执行更新。我编写了下面的代码,作为实现Rprop优化器的起点。但是,我找不到逐个访问参数的方法。对params进行循环(如下面的代码所示)时,每次迭代得到的p, g, g_old, s, wChangeOld都是矩阵。
有没有办法遍历各个参数并分别更新它们?如果可以根据梯度的符号对参数向量进行索引,也能解决问题。
class Rprop(Optimizer):
    """Work-in-progress resilient backpropagation (Rprop) optimizer.

    For every parameter this class keeps a step size, the previous gradient
    and the previous weight change, and tries to choose the update from the
    sign of ``g * g_old``.

    NOTE(review): the ``if``/``elif`` chain in ``get_updates`` compares the
    result of ``K.sign`` with a Python scalar instead of selecting
    element-wise; an element-wise select such as ``K.switch`` is presumably
    what is intended -- confirm before relying on this class.
    """
    def __init__(self, init_step=0.01, **kwargs):
        """Store the initial step size and the step-adaptation constants.

        Args:
            init_step: Initial step size; wrapped in a backend variable.
            **kwargs: Forwarded unchanged to the base ``Optimizer``.
        """
        super(Rprop, self).__init__(**kwargs)
        self.init_step = K.variable(init_step, name='init_step')
        self.iterations = K.variable(0., name='iterations')
        # Multiplier applied to the step size in get_updates.
        self.posStep = 1.2
        # negStep and minStep are assigned here but never read in this class.
        self.negStep = 0.5
        self.minStep = 1e-6
        # Bound passed to K.minimum / K.maximum when the step size is rescaled.
        self.maxStep = 50.
    def get_updates(self, params, constraints, loss):
        """Build and return the list of backend update operations.

        Args:
            params: Parameters to optimize; iterated in lockstep with their
                gradients and per-parameter state.
            constraints: Container queried with ``p in constraints`` and
                indexed by parameter; the value found is called on the
                updated parameter.
            loss: Passed to ``self.get_gradients`` together with ``params``.

        Returns:
            ``self.updates``, the list of update operations built here.
        """
        grads = self.get_gradients(loss, params)
        # NOTE: this list is replaced by an empty list a few lines below, so
        # the iterations increment is not part of the returned updates.
        self.updates = [K.update_add(self.iterations, 1)]
        shapes = [K.get_variable_shape(p) for p in params]
        # Per-parameter state: step sizes, previous weight changes and
        # previous gradients, each shaped like its parameter.
        stepList = [K.ones(shape)*self.init_step for shape in shapes]
        wChangeOldList = [K.zeros(shape) for shape in shapes]
        grads_old = [K.zeros(shape) for shape in shapes]
        self.weights = stepList + grads_old + wChangeOldList
        self.updates = []
        for p, g, g_old, s, wChangeOld in zip(params, grads, grads_old,
                                              stepList, wChangeOldList):
            # Sign of the product of the current and the previous gradient.
            change = K.sign(g * g_old)
            # NOTE(review): `change` comes from K.sign, so this is not a
            # per-element branch; presumably K.switch is needed -- confirm.
            if change > 0:
                s_new = K.minimum(s * self.posStep, self.maxStep)
                wChange = s_new * K.sign(g)
                g_new = g
            elif change < 0:
                # NOTE(review): this branch reuses posStep and maxStep with
                # K.maximum; negStep/minStep look intended here -- confirm.
                s_new = K.maximum(s * self.posStep, self.maxStep)
                wChange = - wChangeOld
                g_new = 0
            else:
                s_new = s
                wChange = s_new * K.sign(g)
                # NOTE(review): stores the parameter `p`, not the gradient,
                # as the "previous gradient" -- looks unintended, confirm.
                g_new = p
            self.updates.append(K.update(g_old, g_new))
            self.updates.append(K.update(wChangeOld, wChange))
            self.updates.append(K.update(s, s_new))
            new_p = p - wChange
            # Apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates
    def get_config(self):
        """Return the base optimizer config extended with ``init_step``."""
        config = {'init_step': float(K.get_value(self.init_step))}
        base_config = super(Rprop, self).get_config()
        # Entries from `config` come last, so they win on duplicate keys.
        return dict(list(base_config.items()) + list(config.items()))
答案 0 :(得分:2)
我也在寻找Keras中的RProp算法,于是找到了这个问题。我冒昧地按自己的需要改写了你的代码,现在把它贴回这里。到目前为止它似乎工作得很好,但我还没有做过充分的测试。
免责声明:我刚接触Keras,但对Theano(以及Blocks)有不少经验。此外,我只用Theano作为后端进行了测试,没有测试TensorFlow。
class RProp(Optimizer):
    """Sign-based resilient backpropagation (Rprop) optimizer.

    Each parameter element has its own step size. The step grows by
    ``scale_up`` (capped at ``max_alpha``) where the current and previous
    gradients have a strictly positive product, and shrinks by
    ``scale_down`` (floored at ``min_alpha``) everywhere else. The parameter
    then moves against the sign of the current gradient by that step.
    """

    def __init__(self,
                 init_alpha=1e-3,
                 scale_up=1.2,
                 scale_down=0.5,
                 min_alpha=1e-6,
                 max_alpha=50.,
                 **kwargs):
        """Wrap every hyper-parameter in a named backend variable.

        Args:
            init_alpha: Initial step size for every parameter element.
            scale_up: Factor applied to the step where gradients agree.
            scale_down: Factor applied to the step otherwise.
            min_alpha: Lower bound used when shrinking the step.
            max_alpha: Upper bound used when growing the step.
            **kwargs: Forwarded unchanged to the base ``Optimizer``.
        """
        super(RProp, self).__init__(**kwargs)
        self.init_alpha = K.variable(init_alpha, name='init_alpha')
        self.scale_up = K.variable(scale_up, name='scale_up')
        self.scale_down = K.variable(scale_down, name='scale_down')
        self.min_alpha = K.variable(min_alpha, name='min_alpha')
        self.max_alpha = K.variable(max_alpha, name='max_alpha')

    def get_updates(self, params, constraints, loss):
        """Build and return the backend update operations.

        Args:
            params: Parameters to optimize.
            constraints: Container queried with ``param in constraints`` and
                indexed by parameter; the value found is called on the
                updated parameter.
            loss: Passed to ``self.get_gradients`` together with ``params``.

        Returns:
            ``self.updates``: for each parameter, in order, the updates of
            the parameter, its step size and its stored previous gradient.
        """
        gradients = self.get_gradients(loss, params)

        # Per-parameter state, shaped like the parameter itself.
        param_shapes = [K.get_variable_shape(weight) for weight in params]
        step_sizes = [K.variable(numpy.ones(dims) * self.init_alpha)
                      for dims in param_shapes]
        prev_gradients = [K.zeros(dims) for dims in param_shapes]
        self.weights = step_sizes + prev_gradients

        self.updates = pending = []
        state = zip(params, gradients, prev_gradients, step_sizes)
        for weight, gradient, prev_gradient, step in state:
            # Grow the step where the gradient kept its sign, shrink it
            # everywhere else (a zero product counts as "shrink").
            same_direction = K.greater(gradient * prev_gradient, 0)
            grown = K.minimum(step * self.scale_up, self.max_alpha)
            shrunk = K.maximum(step * self.scale_down, self.min_alpha)
            next_step = K.switch(same_direction, grown, shrunk)

            next_weight = weight - K.sign(gradient) * next_step

            # Apply constraints
            if weight in constraints:
                next_weight = constraints[weight](next_weight)

            pending.append(K.update(weight, next_weight))
            pending.append(K.update(step, next_step))
            pending.append(K.update(prev_gradient, gradient))
        return self.updates

    def get_config(self):
        """Return the base optimizer config extended with this class's values."""
        merged = dict(super(RProp, self).get_config())
        # Own entries are applied last so they win on duplicate keys.
        merged.update({
            'init_alpha': float(K.get_value(self.init_alpha)),
            'scale_up': float(K.get_value(self.scale_up)),
            'scale_down': float(K.get_value(self.scale_down)),
            'min_alpha': float(K.get_value(self.min_alpha)),
            'max_alpha': float(K.get_value(self.max_alpha)),
        })
        return merged
重要说明:
关于您的代码的一些说明(以下均指您原始代码中的变量名):
- wChange 在后续迭代中从未被使用,因此不需要把它存入持久变量。
- change > 0 并不会像您预期的那样工作,因为 change 是一个张量变量。您需要的是逐元素比较,应改用 K.switch()。
- 您使用了两次 maxStep,而其中一处本应使用 minStep。
- change 为零的情况可以忽略,因为在实践中几乎不会出现。
- g_new = 0 和 g_new = p 都是错误的,应当像第一个 if 分支一样写成 g_new = g。
答案 1 :(得分:1)
我是Keras和Python的新手,但我按自己的需要修改了上面的代码。
由于采用全批量学习并使用偏导数,这是一个非常快速且简单的算法。在我的测试中,它的表现优于包括Adam在内的所有其他反向传播算法。我用TensorFlow和CNTK作为后端进行了测试。
不带权重回溯(weight-backtracking)的改进版Rprop: https://pdfs.semanticscholar.org/df9c/6a3843d54a28138a596acc85a96367a064c2.pdf
class iRprop_(Optimizer):
    """Rprop variant without weight backtracking.

    Works on gradient signs only. Where the current sign and the stored
    previous sign have a positive product the step grows (capped at
    ``max_alpha``); where the product is negative the step shrinks (floored
    at ``min_alpha``) and the sign is zeroed so that element is not moved;
    where the product is zero the step is left unchanged.
    """

    def __init__(self,
                 init_alpha=0.01,
                 scale_up=1.2,
                 scale_down=0.5,
                 min_alpha=0.00001,
                 max_alpha=50.,
                 **kwargs):
        """Wrap every hyper-parameter in a named backend variable.

        Args:
            init_alpha: Initial step size for every parameter element.
            scale_up: Factor applied to the step where signs agree.
            scale_down: Factor applied to the step where signs flip.
            min_alpha: Lower bound used when shrinking the step.
            max_alpha: Upper bound used when growing the step.
            **kwargs: Forwarded unchanged to the base ``Optimizer``.
        """
        super(iRprop_, self).__init__(**kwargs)
        self.init_alpha = K.variable(init_alpha, name='init_alpha')
        self.scale_up = K.variable(scale_up, name='scale_up')
        self.scale_down = K.variable(scale_down, name='scale_down')
        self.min_alpha = K.variable(min_alpha, name='min_alpha')
        self.max_alpha = K.variable(max_alpha, name='max_alpha')

    def get_updates(self, params, loss):
        """Build and return the backend update operations.

        Args:
            params: Parameters to optimize; a parameter's ``constraint``
                attribute, when present and not ``None``, is applied to its
                updated value.
            loss: Passed to ``self.get_gradients`` together with ``params``.

        Returns:
            ``self.updates``: for each parameter, in order, the updates of
            the parameter, its step size and its stored gradient sign.
        """
        gradients = self.get_gradients(loss, params)

        # Per-parameter state, shaped like the parameter itself.
        param_shapes = [K.get_variable_shape(weight) for weight in params]
        step_sizes = [K.variable(K.ones(dims) * self.init_alpha)
                      for dims in param_shapes]
        prev_signs = [K.zeros(dims) for dims in param_shapes]
        self.weights = step_sizes + prev_signs

        self.updates = pending = []
        state = zip(params, gradients, prev_signs, step_sizes)
        for weight, gradient, prev_sign, step in state:
            direction = K.sign(gradient)
            agreement = direction * prev_sign
            flipped = K.less(agreement, 0)

            # Grow on agreement, shrink on a sign flip, otherwise keep.
            grown = K.minimum(step * self.scale_up, self.max_alpha)
            shrunk = K.maximum(step * self.scale_down, self.min_alpha)
            next_step = K.switch(
                K.greater(agreement, 0),
                grown,
                K.switch(flipped, shrunk, step),
            )

            # After a sign flip the element is not moved, and a zero is
            # stored as its sign for the next call.
            direction = K.switch(flipped, K.zeros_like(direction), direction)
            next_weight = weight - direction * next_step

            # Apply constraints.
            constraint = getattr(weight, 'constraint', None)
            if constraint is not None:
                next_weight = weight.constraint(next_weight)

            pending.append(K.update(weight, next_weight))
            pending.append(K.update(step, next_step))
            pending.append(K.update(prev_sign, direction))
        return self.updates

    def get_config(self):
        """Return the base optimizer config extended with this class's values."""
        merged = dict(super(iRprop_, self).get_config())
        # Own entries are applied last so they win on duplicate keys.
        merged.update({
            'init_alpha': float(K.get_value(self.init_alpha)),
            'scale_up': float(K.get_value(self.scale_up)),
            'scale_down': float(K.get_value(self.scale_down)),
            'min_alpha': float(K.get_value(self.min_alpha)),
            'max_alpha': float(K.get_value(self.max_alpha)),
        })
        return merged