I am implementing a custom TensorFlow optimizer by following the Adamax example here. After studying the code, I understand that the following method of AdamaxOptimizer is responsible for updating the weights:
def _apply_dense(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    ...
    ....
    return control_flow_ops.group(*[var_update, v_t, m_t])
To test whether my understanding is correct, I changed var_update to all ones to see whether the weights would all become 1, so the method now returns control_flow_ops.group(*[tf.ones(var_update.shape), v_t, m_t]).
I can see that my network stops improving, but the weights are not all 1:
weights = [-0.00534745 0.00561042 0.05048206 0.06062614 0.04498126 0.07942017 0.02884914 -0.06069052 0.0710757 -0.03092324] acc = 0.120899997651577
weights = [-0.00534745 0.00561042 0.05048206 0.06062614 0.04498126 0.07942017 0.02884914 -0.06069052 0.0710757 -0.03092324] acc = 0.120899997651577
weights = [-0.00534745 0.00561042 0.05048206 0.06062614 0.04498126 0.07942017 0.02884914 -0.06069052 0.0710757 -0.03092324] acc = 0.120899997651577
...
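For clarity, the only line I changed in _apply_dense is the return statement. The original from the Adamax example and my test version are:

# original return from the example
return control_flow_ops.group(*[var_update, v_t, m_t])
# what I return for the test
return control_flow_ops.group(*[tf.ones(var_update.shape), v_t, m_t])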
Isn't the value returned by the Optimizer already the new weights? If so, why don't I see the expected value of 1 when I print the weights?
Can someone explain what control_flow_ops.group is doing? When I look at the documentation, it only says:

Create an op that groups multiple operations.
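From the docs, my current (possibly wrong) reading is that group just bundles several ops into a single op that can be run together, and that running the grouped op does not return any of their values. Here is a toy sketch of that reading, completely separate from my optimizer (the names a, b, inc_a, inc_b are made up just for this illustration):

import tensorflow as tf

a = tf.Variable(1.0)
b = tf.Variable(2.0)
inc_a = tf.assign_add(a, 1.0)
inc_b = tf.assign_add(b, 1.0)
step = tf.group(inc_a, inc_b)  # a single Operation that runs both assigns when executed

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(step))    # prints None: the grouped op itself has no output value
    print(sess.run([a, b]))  # [2.0, 3.0], so both assigns did run

Here is my full script for reference: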
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import os
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import clip_ops
from tensorflow.python.framework import ops
from tensorflow.python.training import optimizer

os.environ['CUDA_VISIBLE_DEVICES'] = '3'
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.1


class AdamaxOptimizer(optimizer.Optimizer):
    """Optimizer that implements the Adamax algorithm.

    See [Kingma et. al., 2014](http://arxiv.org/abs/1412.6980)
    ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
    Gradient clipping is introduced in [Freivalds et. al., 2017](https://arxiv.org/abs/1702.08727)
    ([pdf](https://arxiv.org/pdf/1702.08727)).
    This class provides lazy handling of gradient updates for sparse variables.
    It only updates moving-average accumulators for sparse variable indices that
    appear in the current batch, rather than updating the accumulators for all
    indices.
    """

    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                 clip_gradients=True, clip_multiplier=1.2, clip_epsilon=1e-4,
                 use_locking=False, name="Adamax"):
        """Construct a new AdaMax optimizer.

        Args:
            learning_rate: A Tensor or a floating point value. The learning rate.
            beta1: A float value or a constant float tensor.
                The exponential decay rate for the 1st moment estimates.
            beta2: A float value or a constant float tensor.
                The exponential decay rate for the 2nd moment estimates.
            epsilon: A small constant for numerical stability.
            clip_gradients: Whether to perform gradient clipping.
            clip_multiplier: Multiplier for second moment estimate for gradient clipping.
                Should be >1. Large values correspond to less clipping.
            clip_epsilon: Gradients smaller than this are not clipped. Also, it has
                some effect on the first few optimization steps.
            use_locking: If True use locks for update operations.
            name: Optional name for the operations created when applying gradients.
                Defaults to "Adamax".
        """
        super(AdamaxOptimizer, self).__init__(use_locking, name)
        self._lr = learning_rate
        self._beta1 = beta1
        self._beta2 = beta2
        self._epsilon = epsilon
        self.clip_multiplier = clip_multiplier
        self.clip_epsilon = clip_epsilon
        self.clip_gradients = clip_gradients
        # Tensor versions of the constructor arguments, created in _prepare().
        self._lr_t = None
        self._beta1_t = None
        self._beta2_t = None
        self._epsilon_t = None
        self.clip_multiplier_t = None
        self.clip_epsilon_t = None

    def _prepare(self):
        self._lr_t = ops.convert_to_tensor(self._lr, name="learning_rate")
        self._beta1_t = ops.convert_to_tensor(self._beta1, name="beta1")
        self._beta2_t = ops.convert_to_tensor(self._beta2, name="beta2")
        self._epsilon_t = ops.convert_to_tensor(self._epsilon, name="epsilon")
        self.clip_multiplier_t = ops.convert_to_tensor(self.clip_multiplier, name="clip_multiplier")
        self.clip_epsilon_t = ops.convert_to_tensor(self.clip_epsilon, name="clip_epsilon")

    def _create_slots(self, var_list):
        # Create slots for the first and second moments.
        for v in var_list:
            self._zeros_slot(v, "m", self._name)
            self._zeros_slot(v, "v", self._name)

    def _apply_dense(self, grad, var):
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
        clip_multiplier_t = math_ops.cast(self.clip_multiplier_t, var.dtype.base_dtype)
        clip_epsilon_t = math_ops.cast(self.clip_epsilon_t, var.dtype.base_dtype)

        v = self.get_slot(var, "v")
        # clip gradient so that each value exceeds its previous maximum by no more than clip_multiplier
        if self.clip_gradients:
            clipVal = v * clip_multiplier_t + clip_epsilon_t
            grad = clip_ops.clip_by_value(grad, -clipVal, clipVal)

        # m := beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        m_t = state_ops.assign(m, beta1_t * m + (1. - beta1_t) * grad, use_locking=self._use_locking)

        # v := max(beta2 * v, abs(grad))
        v_t = state_ops.assign(v, math_ops.maximum(beta2_t * v, math_ops.abs(grad)), use_locking=self._use_locking)

        # variable -= learning_rate * m_t / (epsilon_t + v_t)
        # we do not use bias-correction term for the first moment; it does not give observable benefit
        var_update = state_ops.assign_sub(var, lr_t * m_t / (v_t + epsilon_t), use_locking=self._use_locking)
        return control_flow_ops.group(*[tf.ones(var_update.shape), v_t, m_t])
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
X = tf.placeholder(tf.float32, [None, 784])
y_true = tf.placeholder(tf.float32, [None, 10])
output = tf.layers.dense(X, 10, name='dense1')
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_true, logits=output))
optimizer = AdamaxOptimizer()
train = optimizer.minimize(cross_entropy)
init = tf.global_variables_initializer()
sess = tf.Session(config=config)
sess.run(init)
weights1 = [v for v in tf.trainable_variables() if v.name == 'dense1/kernel:0'][0]
# weights2 = [v for v in tf.trainable_variables() if v.name == 'dense2/kernel:0'][0]
weights1 = sess.run(weights1)
# weights2 = sess.run(weights2)
for step in range(1000):
    batch_x, batch_y = mnist.train.next_batch(100)
    sess.run(train, feed_dict={X: batch_x, y_true: batch_y})
    if step % 50 == 0:
        weights2 = [v for v in tf.trainable_variables() if v.name == 'dense1/kernel:0'][0]
        print("weights = ", sess.run(weights2[0]))
        correct_prediction = tf.equal(tf.argmax(output, 1), tf.argmax(y_true, 1))
        acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        print("acc = {}\n".format(sess.run(acc, feed_dict={X: mnist.test.images, y_true: mnist.test.labels})))
weights_after1 = [v for v in tf.trainable_variables() if v.name == 'dense1/kernel:0'][0]
# weights_after2 = [v for v in tf.trainable_variables() if v.name == 'dense1/kernel:0'][0]
weights_after1 = sess.run(weights_after1)
# weights_after2 = sess.run(weights_after2)
print("Weights same ? =",(weights1 == weights_after1).all())
sess.close()