Every time I run the tape.gradient part, memory usage doubles. I think it has something to do with the TimeDistributed layers...? Any ideas?
###### create model
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# each input sample is a sequence of 6 frames of shape (108, 192, 3)
inputs = tf.keras.Input(shape=(6, 108, 192, 3), name='img')  # originally (6, *data_loader_train[0][0][0].shape)
x = layers.TimeDistributed(layers.Conv2D(16, 3, activation='relu'))(inputs)
x = layers.TimeDistributed(layers.Conv2D(16, 3, activation='relu'))(x)
block_1_output = layers.TimeDistributed(layers.MaxPooling2D(2))(x)
x = layers.TimeDistributed(layers.Conv2D(16, 3, activation='relu', padding='same'))(block_1_output)
block_3_output = layers.add([x, block_1_output])  # residual connection
block_3_output = layers.TimeDistributed(layers.MaxPooling2D(2))(block_3_output)
x = layers.TimeDistributed(layers.Conv2D(16, 3, activation='relu'))(block_3_output)
x = layers.TimeDistributed(layers.GlobalAveragePooling2D())(x)
x = layers.Flatten()(x)
x = layers.Dense(16, activation='relu')(x)
x = layers.Dense(1)(x)
counts = tf.keras.activations.softplus(x)  # non-negative count output
model = tf.keras.Model(inputs, counts, name='toy_resnet')
model.summary()
### run model
####### running this part doubles memory every two runs ##########
def batch(data, n):  # minimal stand-in for the batch helper: yields chunks of n samples
    for i in range(0, len(data), n):
        yield data[i:i + n]

for x_ in batch(np.random.uniform(size=(100, 6, 108, 192, 3)).astype(np.float32), 10):
    with tf.GradientTape() as tape:
        count_ = tf.reduce_sum(model(x_))
    grads = tape.gradient(count_, model.trainable_variables)  # the tape.gradient call mentioned above
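For reference, the usual workaround I've seen suggested is to compile the step with tf.function so it is traced once and reused across iterations. Below is a minimal sketch of that (train_step is a name I made up, and it assumes the growth comes from eager-mode bookkeeping rather than something specific to TimeDistributed); I'd still like to understand why the eager loop above doubles memory.

@tf.function
def train_step(x_):
    # same tape section as above, but compiled into a single graph
    with tf.GradientTape() as tape:
        count_ = tf.reduce_sum(model(x_))
    return tape.gradient(count_, model.trainable_variables)

for x_ in batch(np.random.uniform(size=(100, 6, 108, 192, 3)).astype(np.float32), 10):
    grads = train_step(x_)  # fixed input shape/dtype, so no per-iteration retracing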