I wrote a custom TF model (TensorFlow 2.3.1) and trained it with the .fit method. When I call fit, though, the model appears to train twice per epoch. It's strange, and even stranger, it only happens when I run on my organization's supercomputer, not on my local machine.
Here is the code for my model:
import tensorflow as tf
from tensorflow.keras import layers

# rmse_masked_one_var, get_variables, adjust_gradient_list, and
# combine_gradients_list are helper functions defined elsewhere in my code.

class LSTMModel(tf.keras.Model):
    def __init__(self, hidden_size, gradient_correction=False, lamb=1):
        """
        :param hidden_size: [int] the number of hidden units
        """
        super().__init__()
        self.gradient_correction = gradient_correction
        self.lamb = lamb
        self.lstm_layer = layers.LSTM(hidden_size, return_sequences=True,
                                      name='lstm_shared')
        self.dense_main = layers.Dense(1, name='dense_main')
        self.dense_aux = layers.Dense(1, name='dense_aux')

    @tf.function
    def call(self, inputs, **kwargs):
        x = self.lstm_layer(inputs)
        main_prediction = self.dense_main(x)
        aux_prediction = self.dense_aux(x)
        return tf.concat([main_prediction, aux_prediction], axis=2)

    @tf.function
    def train_step(self, data):
        x, y = data

        # If I don't do one forward pass before starting the gradient tape,
        # the thing hangs
        _ = self(x)

        with tf.GradientTape(persistent=True) as tape:
            y_pred = self(x, training=True)  # forward pass
            loss_main = rmse_masked_one_var(y, y_pred, 0)
            loss_aux = rmse_masked_one_var(y, y_pred, 1)

        trainable_vars = self.trainable_variables
        main_out_vars = get_variables(trainable_vars, 'dense_main')
        aux_out_vars = get_variables(trainable_vars, 'dense_aux')
        shared_vars = get_variables(trainable_vars, 'lstm_shared')

        # get gradients
        gradient_main_out = tape.gradient(loss_main, main_out_vars)
        gradient_aux_out = tape.gradient(loss_aux, aux_out_vars)
        gradient_shared_main = tape.gradient(loss_main, shared_vars)
        gradient_shared_aux = tape.gradient(loss_aux, shared_vars)

        if self.gradient_correction:
            # adjust auxiliary gradient
            gradient_shared_aux = adjust_gradient_list(gradient_shared_main,
                                                       gradient_shared_aux)

        combined_gradient = combine_gradients_list(gradient_shared_main,
                                                   gradient_shared_aux,
                                                   lamb=self.lamb)

        # apply gradients
        self.optimizer.apply_gradients(zip(gradient_main_out, main_out_vars))
        self.optimizer.apply_gradients(zip(gradient_aux_out, aux_out_vars))
        self.optimizer.apply_gradients(zip(combined_gradient, shared_vars))

        return {'loss_main': loss_main, 'loss_aux': loss_aux}
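For reference, here is a minimal, self-contained sketch of how I drive training. The helper bodies below (rmse_masked_one_var, get_variables, adjust_gradient_list, combine_gradients_list) are simplified stand-ins rather than my real implementations, and the data shapes, hidden_size, learning rate, and batch size are placeholders chosen only so the example runs (768 samples with batch_size=32 gives the 24 steps per epoch seen in the log):

import numpy as np
import tensorflow as tf

def rmse_masked_one_var(y_true, y_pred, var_idx):
    # stand-in: RMSE on one output variable, ignoring non-finite targets
    t = y_true[:, :, var_idx]
    p = y_pred[:, :, var_idx]
    mask = tf.math.is_finite(t)
    sq = tf.where(mask, tf.square(tf.where(mask, t, tf.zeros_like(t)) - p),
                  tf.zeros_like(p))
    n = tf.reduce_sum(tf.cast(mask, tf.float32))
    return tf.sqrt(tf.reduce_sum(sq) / tf.maximum(n, 1.0))

def get_variables(trainable_vars, name):
    # stand-in: pick variables whose name contains the given substring
    return [v for v in trainable_vars if name in v.name]

def adjust_gradient_list(grads_main, grads_aux):
    # stand-in: return the auxiliary gradients unchanged
    return grads_aux

def combine_gradients_list(grads_main, grads_aux, lamb=1):
    # stand-in: weighted sum of the shared-layer gradients
    return [gm + lamb * ga for gm, ga in zip(grads_main, grads_aux)]

# placeholder data: (batch, timesteps, features) inputs, two target variables
x = np.random.rand(768, 20, 8).astype('float32')
y = np.random.rand(768, 20, 2).astype('float32')

model = LSTMModel(hidden_size=20)
model.compile(optimizer=tf.keras.optimizers.Adam(0.01))
model.fit(x, y, epochs=10, batch_size=32)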
Here is the output when I call .fit():
Epoch 1/10
Epoch 1/10
24/24 [==============================] - 2s 86ms/step - loss_main: 0.6247 - loss_aux: 0.2181
24/24 [==============================] - 2s 87ms/step - loss_main: 0.6202 - loss_aux: 0.2128
Epoch 2/10
Epoch 2/10
24/24 [==============================] - 2s 85ms/step - loss_main: 0.6286 - loss_aux: 0.2115
24/24 [==============================] - 2s 86ms/step - loss_main: 0.6286 - loss_aux: 0.2175
Epoch 3/10
Epoch 3/10
24/24 [==============================] - 2s 86ms/step - loss_main: 0.6352 - loss_aux: 0.2080
24/24 [==============================] - 2s 86ms/step - loss_main: 0.6344 - loss_aux: 0.2137
Epoch 4/10
Epoch 4/10
24/24 [==============================] - 2s 85ms/step - loss_main: 0.6373 - loss_aux: 0.2057
24/24 [==============================] - 2s 85ms/step - loss_main: 0.6314 - loss_aux: 0.2095
Epoch 5/10
Epoch 5/10
24/24 [==============================] - 2s 83ms/step - loss_main: 0.6382 - loss_aux: 0.2023
24/24 [==============================] - 2s 84ms/step - loss_main: 0.6373 - loss_aux: 0.2059
Any ideas why it seems to train twice per epoch?