Question

我正在微调 BERT，并且遇到了 OOM（显存不足）问题。我听说解决此问题的一个好方法是使用“梯度累积”（gradient accumulation）。下面是我的 optimization.py（已加入梯度累积）：

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import re
import tensorflow as tf



from tensorflow.python.training import optimizer
from tensorflow.python.framework import ops





def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu,
                     gradient_accumulation_steps=1):
  """Creates an optimizer training op.

  The learning rate warms up linearly from 0 to `init_lr` over
  `num_warmup_steps` and otherwise decays linearly from `init_lr` to 0 over
  `num_train_steps`. Both counts are measured in runs of the returned op,
  because the global step is incremented on every run (see the end of this
  function), including runs that only accumulate gradients.

  Args:
    loss: Tensor to differentiate with respect to all trainable variables.
    init_lr: Peak learning rate.
    num_train_steps: Number of global steps over which the rate decays to 0.
    num_warmup_steps: Number of warmup global steps; falsy disables warmup.
    use_tpu: If true, wraps the optimizer in
      `tf.contrib.tpu.CrossShardOptimizer`.
    gradient_accumulation_steps: Forwarded to the optimizer as `n`, the number
      of runs whose gradients are averaged into a single Adam update. The
      default of 1 applies an update on every run.

  Returns:
    An op that runs one optimizer step and increments the global step.
  """
  global_step = tf.train.get_or_create_global_step()

  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

  # Implements linear decay of the learning rate.
  learning_rate = tf.train.polynomial_decay(
      learning_rate,
      global_step,
      num_train_steps,
      end_learning_rate=0.0,
      power=1.0,
      cycle=False)

  # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
  # learning rate will be `global_step/num_warmup_steps * init_lr`.
  if num_warmup_steps:
    global_steps_int = tf.cast(global_step, tf.int32)
    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = init_lr * warmup_percent_done

    # `is_warmup` is 1.0 during warmup and 0.0 afterwards, so this selects
    # between the two schedules without a conditional op.
    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    learning_rate = (
        (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)

  # It is recommended that you use this optimizer for fine tuning, since this
  # is how the model was trained (note that the Adam m/v variables are NOT
  # loaded from init_checkpoint.)
  # The local is named `opt` so that it does not shadow the imported
  # `optimizer` module.
  opt = MultistepAdamWeightDecayOptimizer(
      learning_rate=learning_rate,
      weight_decay_rate=0.01,
      beta_1=0.9,
      beta_2=0.999,  # 0.98 ONLY USED FOR PRETRAIN. MUST CHANGE AT FINE-TUNING 0.999,
      n=gradient_accumulation_steps,
      epsilon=1e-6,
      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

  if use_tpu:
    opt = tf.contrib.tpu.CrossShardOptimizer(opt)

  tvars = tf.trainable_variables()
  grads = tf.gradients(loss, tvars)

  # This is how the model was pre-trained.
  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

  train_op = opt.apply_gradients(
      zip(grads, tvars), global_step=global_step)

  # Normally the global step update is done inside of `apply_gradients`.
  # However, `MultistepAdamWeightDecayOptimizer` doesn't do this. But if you
  # use a different optimizer, you should probably take this line out.
  new_global_step = global_step + 1
  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
  return train_op


class MultistepAdamWeightDecayOptimizer(optimizer.Optimizer):
  """A basic Adam optimizer with "correct" L2 weight decay and accumulation.

  Each call of the op returned by `apply_gradients` either adds the incoming
  gradients to a per-variable "grad_acc" slot, or, when the "iter" counter is
  0, averages the accumulated sum plus the current gradient over `n` and
  applies one Adam update, after which the accumulator is reset to zero. The
  counter advances modulo `n` on every call, so with `n=1` every call applies
  an update.
  """

  def __init__(self,
               learning_rate,
               weight_decay_rate=0.0,
               beta_1=0.9,
               beta_2=0.999,
               n=1,
               epsilon=1e-6,
               exclude_from_weight_decay=None,
               name="MultistepAdamWeightDecayOptimizer"):
    """Constructs a MultistepAdamWeightDecayOptimizer.

    Args:
      learning_rate: Learning rate multiplied into every update.
      weight_decay_rate: Coefficient of the weight-decay term added to the
        Adam update; a falsy value disables weight decay.
      beta_1: Decay rate of the first-moment estimate `m`.
      beta_2: Decay rate of the second-moment estimate `v`.
      n: Number of calls whose gradients are averaged into one update.
      epsilon: Constant added to `sqrt(v)` in the update denominator.
      exclude_from_weight_decay: Optional list of regex patterns; variables
        whose name matches any of them get no weight decay.
      name: Optimizer name, also used as the op name for the slot variables.
    """
    super(MultistepAdamWeightDecayOptimizer, self).__init__(False, name)

    self.learning_rate = learning_rate
    self.weight_decay_rate = weight_decay_rate
    self.beta_1 = beta_1
    self.beta_2 = beta_2
    self.epsilon = epsilon
    self._n = n

    self.exclude_from_weight_decay = exclude_from_weight_decay

    # Tensor form of `n`; created in `_prepare`.
    self._n_t = None

  def _prepare(self):
    """Converts `n` to a tensor for use in graph arithmetic."""
    super(MultistepAdamWeightDecayOptimizer, self)._prepare()
    self._n_t = tf.convert_to_tensor(self._n, name="n")

  def _create_slots(self, var_list):
    """Creates the "iter" counter and one "grad_acc" slot per variable."""
    super(MultistepAdamWeightDecayOptimizer, self)._create_slots(var_list)
    first_var = min(var_list, key=lambda x: x.name)
    # The counter starts at 1 when accumulating so that the first update is
    # applied only once the counter has wrapped around to 0.
    self._create_non_slot_variable(initial_value=0 if self._n == 1 else 1,
                                   name="iter",
                                   colocate_with=first_var)

    for v in var_list:
      self._zeros_slot(v, "grad_acc", self._name)

  def _get_iter_variable(self):
    """Returns the non-slot "iter" counter variable."""
    if tf.contrib.eager.in_eager_mode():
      graph = None
    else:
      graph = tf.get_default_graph()
    return self._get_non_slot_variable("iter", graph=graph)

  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """See base class.

    Args:
      grads_and_vars: Iterable of (gradient, variable) pairs. Pairs where
        either element is None are skipped.
      global_step: Unused; this optimizer does not increment the global step.
      name: Optional name for the returned op.

    Returns:
      An op that accumulates or applies the gradients for every variable and
      then advances the "iter" counter.

    Raises:
      ValueError: If no pair with both a gradient and a variable is given.
    """
    # Materialize first: callers pass `zip(grads, tvars)`, which on Python 3
    # is a one-shot iterator, and the pairs are traversed more than once.
    grads_and_vars = [(g, v) for g, v in grads_and_vars
                      if g is not None and v is not None]
    if not grads_and_vars:
      raise ValueError("No gradients provided for any variable.")

    var_list = [v for _, v in grads_and_vars]

    with ops.init_scope():
      self._create_slots(var_list)
    self._prepare()

    update_ops = [self._build_update_op(grad, param)
                  for grad, param in grads_and_vars]

    return self._finish(update_ops, name_scope=name)

  def _build_update_op(self, grad, param):
    """Builds the accumulate-or-apply op for a single variable."""
    grad_acc = self.get_slot(param, "grad_acc")
    param_name = self._get_variable_name(param.name)

    m = tf.get_variable(
        name=param_name + "/adam_m",
        shape=param.shape.as_list(),
        dtype=tf.float32,
        trainable=False,
        initializer=tf.zeros_initializer())
    v = tf.get_variable(
        name=param_name + "/adam_v",
        shape=param.shape.as_list(),
        dtype=tf.float32,
        trainable=False,
        initializer=tf.zeros_initializer())

    return tf.cond(
        tf.equal(self._get_iter_variable(), 0),
        lambda: self._apply_adam(grad_acc, grad, param, param_name, m, v),
        lambda: self._accumulate_gradient(grad_acc, grad))

  def _apply_adam(self, grad_acc, grad, param, param_name, m, v):
    """Applies one Adam update with the averaged gradient, then resets it."""
    total_grad = (grad_acc + grad) / tf.cast(self._n_t, grad.dtype)

    # Standard Adam update.
    next_m = (
        tf.multiply(self.beta_1, m) +
        tf.multiply(1.0 - self.beta_1, total_grad))
    next_v = (
        tf.multiply(self.beta_2, v) +
        tf.multiply(1.0 - self.beta_2, tf.square(total_grad)))

    update = next_m / (tf.sqrt(next_v) + self.epsilon)

    # The decay term is added to the update itself rather than to the loss,
    # so it does not pass through the m/v moment estimates.
    if self._do_use_weight_decay(param_name):
      update += self.weight_decay_rate * param

    update_with_lr = self.learning_rate * update
    next_param = param - update_with_lr

    adam_op = tf.group(param.assign(next_param), m.assign(next_m),
                       v.assign(next_v))
    # Clear the accumulator only after the update that consumed it has run.
    with tf.control_dependencies([adam_op]):
      grad_acc_to_zero_op = grad_acc.assign(
          tf.zeros_like(grad_acc), use_locking=self._use_locking)
    return tf.group(adam_op, grad_acc_to_zero_op)

  def _accumulate_gradient(self, grad_acc, grad):
    """Adds `grad` to the accumulator without touching the variable."""
    assign_op = tf.assign_add(grad_acc, grad, use_locking=self._use_locking)
    # Wrap in a group so both `tf.cond` branches return an op, not a tensor.
    return tf.group(assign_op)

  def _finish(self, update_ops, name_scope):
    """Advances the "iter" counter modulo `n` after all updates have run."""
    iter_ = self._get_iter_variable()
    with tf.control_dependencies(update_ops):
      with tf.colocate_with(iter_):
        update_iter = iter_.assign(tf.mod(iter_ + 1, self._n_t),
                                   use_locking=self._use_locking)

    return tf.group(
        *update_ops + [update_iter], name=name_scope)

  def _do_use_weight_decay(self, param_name):
    """Whether to use L2 weight decay for `param_name`."""
    if not self.weight_decay_rate:
      return False
    if self.exclude_from_weight_decay:
      for r in self.exclude_from_weight_decay:
        if re.search(r, param_name) is not None:
          return False
    return True

  def _get_variable_name(self, param_name):
    """Get the variable name from the tensor name."""
    # Strips a trailing ":<digits>" suffix, e.g. "w:0" -> "w".
    m = re.match("^(.*):\\d+$", param_name)
    if m is not None:
      param_name = m.group(1)
    return param_name

使用了这个 optimization.py 之后，我可以使用更大的批大小了。但是损失并没有下降，而且在训练了 300 步之后（我有 550000 条训练数据，批大小为 64，迭代 1000，共 20 个 epoch），程序提示：训练循环被标记为已完成并停止。我不确定问题出在哪里，能请您帮帮我吗？谢谢。

BERT微调中使用梯度累加的方法

0 个答案: