I want to train the Transformer network in a distributed setting. My code for the network and the loss function is based on TensorFlow's official implementation. Here is the loss function implementation, taken from here:
def _pad_tensors_to_same_length(x, y):
  """Pad x and y so that the results have the same length (second dimension)."""
  with tf.name_scope("pad_to_same_length"):
    x_length = tf.shape(x)[1]
    y_length = tf.shape(y)[1]
    max_length = tf.maximum(x_length, y_length)

    x = tf.pad(x, [[0, 0], [0, max_length - x_length], [0, 0]])
    y = tf.pad(y, [[0, 0], [0, max_length - y_length]])
    return x, y

def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
  """Calculate cross entropy loss while ignoring padding.

  Args:
    logits: Tensor of size [batch_size, length_logits, vocab_size]
    labels: Tensor of size [batch_size, length_labels]
    smoothing: Label smoothing constant, used to determine the on and off values
    vocab_size: int size of the vocabulary

  Returns:
    Returns the cross entropy loss and weight tensors: float32 tensors with
      shape [batch_size, max(length_logits, length_labels)]
  """
  with tf.name_scope("loss"):
    logits, labels = _pad_tensors_to_same_length(logits, labels)

    # Calculate smoothing cross entropy
    with tf.name_scope("smoothing_cross_entropy"):
      confidence = 1.0 - smoothing
      low_confidence = (1.0 - confidence) / tf.cast(vocab_size - 1, tf.float32)
      soft_targets = tf.one_hot(
          tf.cast(labels, tf.int32),
          depth=vocab_size,
          on_value=confidence,
          off_value=low_confidence)
      xentropy = tf.nn.softmax_cross_entropy_with_logits(
          logits=logits, labels=soft_targets)

      # Calculate the best (lowest) possible value of cross entropy, and
      # subtract from the cross entropy loss.
      normalizing_constant = -(
          confidence * tf.math.log(confidence) +
          tf.cast(vocab_size - 1, tf.float32) * low_confidence *
          tf.math.log(low_confidence + 1e-20))
      xentropy -= normalizing_constant

    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    return xentropy * weights, weights

def transformer_loss(logits, labels, smoothing, vocab_size):
  """Calculates total loss containing cross entropy with padding ignored.

  Args:
    logits: Tensor of size [batch_size, length_logits, vocab_size]
    labels: Tensor of size [batch_size, length_labels]
    smoothing: Label smoothing constant, used to determine the on and off values
    vocab_size: int size of the vocabulary

  Returns:
    A scalar float tensor for loss.
  """
  xentropy, weights = padded_cross_entropy_loss(logits, labels, smoothing,
                                                vocab_size)
  return tf.reduce_sum(xentropy) / tf.reduce_sum(weights)
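Just to make the expected shapes concrete, here is a purely illustrative call (the values and vocab_size are made up; label id 0 is the padding token):

import tensorflow as tf

vocab_size = 16  # made-up value, only for illustration
logits = tf.random.uniform([2, 7, vocab_size])  # [batch_size, length, vocab_size]
labels = tf.constant([[3, 5, 2, 0, 0, 0, 0],
                      [2, 9, 4, 1, 6, 0, 0]])   # [batch_size, length], 0 = padding
loss = transformer_loss(logits, labels, smoothing=0.1, vocab_size=vocab_size)
# loss is a scalar: total token cross entropy / number of non-padding tokens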
According to the official guideline for distributed training, when using MirroredStrategy the per-example loss computed on each replica should be divided by GLOBAL_BATCH_SIZE rather than the per-replica BATCH_SIZE, because the gradients from all replicas are summed.
This is easy to do when the BATCH_SIZE is fixed, using the following code:
def transformer_loss(logits, labels, smoothing, vocab_size):
  """Calculates total loss containing cross entropy with padding ignored.

  Args:
    logits: Tensor of size [batch_size, length_logits, vocab_size]
    labels: Tensor of size [batch_size, length_labels]
    smoothing: Label smoothing constant, used to determine the on and off values
    vocab_size: int size of the vocabulary

  Returns:
    A scalar float tensor for loss.
  """
  xentropy, weights = padded_cross_entropy_loss(logits, labels, smoothing,
                                                vocab_size)
  xentropy = tf.reduce_sum(xentropy, axis=1) / tf.reduce_sum(weights, axis=1)
  return tf.reduce_sum(xentropy) * (1.0 / GLOBAL_BATCH_SIZE)
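If I understand the guide correctly, with a fixed batch size the same scaling can also be written with tf.nn.compute_average_loss (a sketch, assuming that function is available in my TF version; transformer_loss_v2 is just an illustrative name):

def transformer_loss_v2(logits, labels, smoothing, vocab_size):
  # Same scaling as above: sum the per-sequence losses and divide by the
  # global batch size instead of the per-replica batch size.
  xentropy, weights = padded_cross_entropy_loss(logits, labels, smoothing,
                                                vocab_size)
  per_example_loss = tf.reduce_sum(xentropy, axis=1) / tf.reduce_sum(weights, axis=1)
  return tf.nn.compute_average_loss(per_example_loss,
                                    global_batch_size=GLOBAL_BATCH_SIZE)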
My question is how to do this when the sentences are batched with the bucket_by_sequence_length function, which groups sentences of similar length together without exceeding a maximum number of tokens per batch. When batching this way, every fetch from the dataset ends up with a different global batch size. The batching code is based on the Tensor2Tensor implementation and can be found here.
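For context, the bucketing pipeline I mean looks roughly like this (a minimal sketch: the boundaries, the token budget, and the (inp, tar) dataset are placeholders, not my real values):

import tensorflow as tf

max_tokens_per_batch = 4096            # placeholder token budget per batch
boundaries = [8, 16, 32, 64, 128]      # placeholder bucket boundaries
# Longer sequences get smaller batch sizes so the token count per batch stays
# bounded, which means the batch size differs from bucket to bucket.
batch_sizes = [max(1, max_tokens_per_batch // b) for b in boundaries + [256]]

def make_batches(dataset):
  """dataset yields (inp, tar) pairs of 1-D int token tensors."""
  return dataset.apply(
      tf.data.experimental.bucket_by_sequence_length(
          element_length_func=lambda inp, tar: tf.maximum(
              tf.shape(inp)[0], tf.shape(tar)[0]),
          bucket_boundaries=boundaries,
          bucket_batch_sizes=batch_sizes))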
My training step looks like this:
with strategy.scope():
  def train_step(inputs):
    inp, tar = inputs
    tar_real = tar[:, 1:]
    with tf.GradientTape() as tape:
      predictions = model([inp, tar_real], True)
      loss = transformer_loss(predictions, tar_real, params['label_smoothing'], vocab_size)
    gradients = tape.gradient(loss, model.variables)
    update_vars = optimizer.apply_gradients(zip(gradients, model.variables))
    update_loss = train_loss.update_state(loss)
    update_acc = train_accuracy.update_state(tf.reduce_mean(padded_accuracy(predictions, tar_real)[0]))
    with tf.control_dependencies([update_vars, update_loss, update_acc]):
      return tf.identity(loss)

  dist_train = strategy.reduce(tf.distribute.ReduceOp.MEAN, strategy.experimental_run(train_step, train_iterator))
Any ideas?