I wrote a custom TF model (TensorFlow 2.3.1) and trained it with the .fit method. When I call fit, though, the model appears to train twice per epoch. It's strange, and even stranger, it only happens when I run on my organization's supercomputer, not on my local machine.
Here is the code for my model:
import tensorflow as tf
from tensorflow.keras import layers

# rmse_masked_one_var, get_variables, adjust_gradient_list, and
# combine_gradients_list are helper functions defined elsewhere in my code.

class LSTMModel(tf.keras.Model):
    def __init__(self, hidden_size, gradient_correction=False, lamb=1):
        """
        :param hidden_size: [int] the number of hidden units
        """
        super().__init__()
        self.gradient_correction = gradient_correction
        self.lamb = lamb
        self.lstm_layer = layers.LSTM(hidden_size, return_sequences=True,
                                      name='lstm_shared')
        self.dense_main = layers.Dense(1, name='dense_main')
        self.dense_aux = layers.Dense(1, name='dense_aux')

    @tf.function
    def call(self, inputs, **kwargs):
        x = self.lstm_layer(inputs)
        main_prediction = self.dense_main(x)
        aux_prediction = self.dense_aux(x)
        return tf.concat([main_prediction, aux_prediction], axis=2)

    @tf.function
    def train_step(self, data):
        x, y = data

        # If I don't do one forward pass before starting the gradient tape,
        # the thing hangs
        _ = self(x)

        with tf.GradientTape(persistent=True) as tape:
            y_pred = self(x, training=True)  # forward pass
            loss_main = rmse_masked_one_var(y, y_pred, 0)
            loss_aux = rmse_masked_one_var(y, y_pred, 1)

        trainable_vars = self.trainable_variables
        main_out_vars = get_variables(trainable_vars, 'dense_main')
        aux_out_vars = get_variables(trainable_vars, 'dense_aux')
        shared_vars = get_variables(trainable_vars, 'lstm_shared')

        # get gradients
        gradient_main_out = tape.gradient(loss_main, main_out_vars)
        gradient_aux_out = tape.gradient(loss_aux, aux_out_vars)
        gradient_shared_main = tape.gradient(loss_main, shared_vars)
        gradient_shared_aux = tape.gradient(loss_aux, shared_vars)

        if self.gradient_correction:
            # adjust auxiliary gradient
            gradient_shared_aux = adjust_gradient_list(gradient_shared_main,
                                                       gradient_shared_aux)

        combined_gradient = combine_gradients_list(gradient_shared_main,
                                                   gradient_shared_aux,
                                                   lamb=self.lamb)

        # apply gradients
        self.optimizer.apply_gradients(zip(gradient_main_out, main_out_vars))
        self.optimizer.apply_gradients(zip(gradient_aux_out, aux_out_vars))
        self.optimizer.apply_gradients(zip(combined_gradient, shared_vars))

        return {'loss_main': loss_main, 'loss_aux': loss_aux}
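For reference, here is a minimal, self-contained sketch of how I drive training. The helper bodies below (rmse_masked_one_var, get_variables, adjust_gradient_list, combine_gradients_list) are simplified stand-ins rather than my real implementations, and the data shapes, hidden_size, learning rate, and batch size are placeholders chosen only so the example runs (768 samples with batch_size=32 gives the 24 steps per epoch seen in the log):

import numpy as np
import tensorflow as tf

def rmse_masked_one_var(y_true, y_pred, var_idx):
    # stand-in: RMSE on one output variable, ignoring non-finite targets
    t = y_true[:, :, var_idx]
    p = y_pred[:, :, var_idx]
    mask = tf.math.is_finite(t)
    sq = tf.where(mask, tf.square(tf.where(mask, t, tf.zeros_like(t)) - p),
                  tf.zeros_like(p))
    n = tf.reduce_sum(tf.cast(mask, tf.float32))
    return tf.sqrt(tf.reduce_sum(sq) / tf.maximum(n, 1.0))

def get_variables(trainable_vars, name):
    # stand-in: pick variables whose name contains the given substring
    return [v for v in trainable_vars if name in v.name]

def adjust_gradient_list(grads_main, grads_aux):
    # stand-in: return the auxiliary gradients unchanged
    return grads_aux

def combine_gradients_list(grads_main, grads_aux, lamb=1):
    # stand-in: weighted sum of the shared-layer gradients
    return [gm + lamb * ga for gm, ga in zip(grads_main, grads_aux)]

# placeholder data: (batch, timesteps, features) inputs, two target variables
x = np.random.rand(768, 20, 8).astype('float32')
y = np.random.rand(768, 20, 2).astype('float32')

model = LSTMModel(hidden_size=20)
model.compile(optimizer=tf.keras.optimizers.Adam(0.01))
model.fit(x, y, epochs=10, batch_size=32)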
Here is the output when I call .fit():
Epoch 1/10
Epoch 1/10
24/24 [==============================] - 2s 86ms/step - loss_main: 0.6247 - loss_aux: 0.2181
24/24 [==============================] - 2s 87ms/step - loss_main: 0.6202 - loss_aux: 0.2128
Epoch 2/10
Epoch 2/10
24/24 [==============================] - 2s 85ms/step - loss_main: 0.6286 - loss_aux: 0.2115
24/24 [==============================] - 2s 86ms/step - loss_main: 0.6286 - loss_aux: 0.2175
Epoch 3/10
Epoch 3/10
24/24 [==============================] - 2s 86ms/step - loss_main: 0.6352 - loss_aux: 0.2080
24/24 [==============================] - 2s 86ms/step - loss_main: 0.6344 - loss_aux: 0.2137
Epoch 4/10
Epoch 4/10
24/24 [==============================] - 2s 85ms/step - loss_main: 0.6373 - loss_aux: 0.2057
24/24 [==============================] - 2s 85ms/step - loss_main: 0.6314 - loss_aux: 0.2095
Epoch 5/10
Epoch 5/10
24/24 [==============================] - 2s 83ms/step - loss_main: 0.6382 - loss_aux: 0.2023
24/24 [==============================] - 2s 84ms/step - loss_main: 0.6373 - loss_aux: 0.2059
Any ideas why it seems to train twice per epoch?