I'm trying to train a model on Google Colab with a TPU for a university project, using TensorFlow 1.15.0. Following the TPU examples, I convert my tf.keras.models.Model instance into a TPU-compatible one with the appropriate distribution strategy (code below).
TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR']
resolver = tf.contrib.cluster_resolver.TPUClusterResolver(tpu=TPU_WORKER)
tf.contrib.distribute.initialize_tpu_system(resolver)
strategy = tf.contrib.distribute.TPUStrategy(resolver)
print('Running on TPU ', resolver.cluster_spec().as_dict()['worker'])
This is followed by the model creation and training calls (code below):
with strategy.scope():
    model = define_generator()
    adam = tf.train.AdamOptimizer(learning_rate=0.0002, beta1=0.5, beta2=0.999)
    model.compile(optimizer=adam, loss='mean_absolute_error', metrics=['accuracy'])
    model.summary()
    model.fit(X_train, Y_train, steps_per_epoch=1451, epochs=64, batch_size=8, callbacks=[term])
The define_generator() function is as follows:
# define an encoder block
def define_encoder_block(layer_in, n_filters, batchnorm=True):
    # weight initialization
    init = RandomNormal(stddev=0.02)
    # add downsampling layer
    g = tf.keras.layers.Conv2D(int(n_filters/2), (3,3), padding='same', kernel_initializer=init)(layer_in)
    g = tf.keras.layers.Conv2D(int(n_filters/2), (3,3), strides=(2,2), padding='same', kernel_initializer=init)(g)
    g = tf.keras.layers.Conv2D(n_filters, (3,3), padding='same', kernel_initializer=init)(g)
    # conditionally add batch normalization
    if batchnorm:
        g = tf.keras.layers.BatchNormalization()(g, training=True)
    # elu activation
    g = tf.keras.activations.elu(g)
    return g

# define a decoder block
def decoder_block(layer_in, skip_in, n_filters, dropout=True):
    # weight initialization
    init = RandomNormal(stddev=0.02)
    # add upsampling layer
    g = tf.keras.layers.Conv2D(int(n_filters/2), (3,3), padding='same', kernel_initializer=init)(layer_in)
    g = tf.keras.layers.Conv2D(int(n_filters/2), (3,3), padding='same', kernel_initializer=init)(layer_in)
    g = tf.keras.layers.Conv2DTranspose(n_filters, (3,3), strides=(2,2), padding='same', kernel_initializer=init)(g)
    # add batch normalization
    g = tf.keras.layers.BatchNormalization()(g, training=True)
    # conditionally add dropout
    if dropout:
        g = tf.keras.layers.Dropout(0.5)(g, training=True)
    # merge with skip connection
    g = tf.keras.layers.Concatenate()([g, skip_in])
    # elu activation
    g = tf.keras.activations.elu(g)
    return g

# define complete model
def define_generator(image_shape=(256,256,3)):
    # weight initialization
    init = RandomNormal(stddev=0.02)
    # image input
    in_image = tf.keras.layers.Input(shape=image_shape)
    # encoder model: C64-C128-C256-C512-C512-C512-C512-C512
    e1 = define_encoder_block(in_image, 64, batchnorm=False)
    e2 = define_encoder_block(e1, 128)
    e3 = define_encoder_block(e2, 256)
    e4 = define_encoder_block(e3, 512)
    e5 = define_encoder_block(e4, 512)
    e6 = define_encoder_block(e5, 512)
    e7 = define_encoder_block(e6, 512)
    # bottleneck, no batch norm; elu activation
    b = Conv2D(512, (3,3), strides=(2,2), padding='same', kernel_initializer=init)(e7)
    b = tf.keras.activations.elu(b)
    # decoder model: CD512-CD1024-CD1024-C1024-C1024-C512-C256-C128
    d1 = decoder_block(b, e7, 512)
    d2 = decoder_block(d1, e6, 512)
    d3 = decoder_block(d2, e5, 512)
    d4 = decoder_block(d3, e4, 512, dropout=False)
    d5 = decoder_block(d4, e3, 256, dropout=False)
    d6 = decoder_block(d5, e2, 128, dropout=False)
    d7 = decoder_block(d6, e1, 64, dropout=False)
    # output
    g = tf.keras.layers.Conv2DTranspose(3, (3,3), strides=(2,2), padding='same', kernel_initializer=init)(d7)
    out_image = tf.keras.activations.tanh(g)
    # define model
    model = tf.keras.models.Model(in_image, out_image)
    return model
However, I get an InternalError: Failed to serialize message, which traces back to the model.fit()
call. I've searched everywhere for a solution but couldn't find one. Can someone help me?
Here is a link to my Colab notebook, where the full traceback can be found:
https://colab.research.google.com/drive/1bA1UlSMGuqH8Ph5PuLfslM2f71SaEtd-
Answer 0 (score: 0)
Support for Keras models on TPUs has improved considerably in more recent releases, so I went ahead and updated your code sample for TF 2.2. Most of the changes are simple renames; the biggest one is that I set up the input data as a tf.data.Dataset. For the best results on TPUs we always recommend using tf.data.Dataset rather than passing numpy arrays directly to model.fit. If your data is already in numpy, you can create a dataset with tf.data.Dataset.from_tensor_slices((X_train, Y_train)), although you will likely get better results with TFRecords. I don't have access to your original dataset, so I've used random tensors in its place.
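Since I mentioned TFRecords: below is a minimal sketch of what that route could look like, assuming the same float32 image pairs of shape (256, 256, 3). The file name train.tfrecord and the feature keys 'image' and 'target' are placeholders I made up for illustration, and on a Colab TPU the file would normally need to live in a GCS bucket (gs://...) that the TPU workers can read; a local path is only used here to keep the sketch short.

# Sketch: write the numpy pairs to a TFRecord file and read them back as a
# tf.data.Dataset. Assumes float32 arrays of shape (256, 256, 3); the path and
# feature keys are illustrative only.
import numpy as np
import tensorflow as tf

IMG_SHAPE = (256, 256, 3)

def serialize_pair(x, y):
    # store each tensor as raw float32 bytes
    feature = {
        'image': tf.train.Feature(bytes_list=tf.train.BytesList(value=[x.astype(np.float32).tobytes()])),
        'target': tf.train.Feature(bytes_list=tf.train.BytesList(value=[y.astype(np.float32).tobytes()])),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature)).SerializeToString()

def write_tfrecord(path, X, Y):
    with tf.io.TFRecordWriter(path) as writer:
        for x, y in zip(X, Y):
            writer.write(serialize_pair(x, y))

def parse_pair(record):
    parsed = tf.io.parse_single_example(record, {
        'image': tf.io.FixedLenFeature([], tf.string),
        'target': tf.io.FixedLenFeature([], tf.string),
    })
    x = tf.reshape(tf.io.decode_raw(parsed['image'], tf.float32), IMG_SHAPE)
    y = tf.reshape(tf.io.decode_raw(parsed['target'], tf.float32), IMG_SHAPE)
    return x, y

# write_tfrecord('train.tfrecord', X_train, Y_train)
# dataset = (tf.data.TFRecordDataset('train.tfrecord')
#            .map(parse_pair, num_parallel_calls=tf.data.experimental.AUTOTUNE)
#            .repeat(epochs)
#            .batch(batch_size, drop_remainder=True)
#            .prefetch(16))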
Here is the updated code:
%tensorflow_version 2.x
import os
import tensorflow as tf
import numpy as np
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.experimental.TPUStrategy(resolver)
# define an encoder block
def define_encoder_block(layer_in, n_filters, batchnorm=True):
    # weight initialization
    init = tf.keras.initializers.RandomNormal(stddev=0.02)
    # add downsampling layer
    g = tf.keras.layers.Conv2D(int(n_filters/2), (3,3), padding='same', kernel_initializer=init)(layer_in)
    g = tf.keras.layers.Conv2D(int(n_filters/2), (3,3), strides=(2,2), padding='same', kernel_initializer=init)(g)
    g = tf.keras.layers.Conv2D(n_filters, (3,3), padding='same', kernel_initializer=init)(g)
    # conditionally add batch normalization
    if batchnorm:
        g = tf.keras.layers.BatchNormalization()(g, training=True)
    # elu activation
    g = tf.keras.activations.elu(g)
    return g

# define a decoder block
def decoder_block(layer_in, skip_in, n_filters, dropout=True):
    # weight initialization
    init = tf.keras.initializers.RandomNormal(stddev=0.02)
    # add upsampling layer
    g = tf.keras.layers.Conv2D(int(n_filters/2), (3,3), padding='same', kernel_initializer=init)(layer_in)
    g = tf.keras.layers.Conv2D(int(n_filters/2), (3,3), padding='same', kernel_initializer=init)(layer_in)
    g = tf.keras.layers.Conv2DTranspose(n_filters, (3,3), strides=(2,2), padding='same', kernel_initializer=init)(g)
    # add batch normalization
    g = tf.keras.layers.BatchNormalization()(g, training=True)
    # conditionally add dropout
    if dropout:
        g = tf.keras.layers.Dropout(0.5)(g, training=True)
    # merge with skip connection
    g = tf.keras.layers.Concatenate()([g, skip_in])
    # elu activation
    g = tf.keras.activations.elu(g)
    return g

# define complete model
def define_generator(image_shape=(256,256,3)):
    # weight initialization
    init = tf.keras.initializers.RandomNormal(stddev=0.02)
    # image input
    in_image = tf.keras.layers.Input(shape=image_shape)
    # encoder model: C64-C128-C256-C512-C512-C512-C512-C512
    e1 = define_encoder_block(in_image, 64, batchnorm=False)
    e2 = define_encoder_block(e1, 128)
    e3 = define_encoder_block(e2, 256)
    e4 = define_encoder_block(e3, 512)
    e5 = define_encoder_block(e4, 512)
    e6 = define_encoder_block(e5, 512)
    e7 = define_encoder_block(e6, 512)
    # bottleneck, no batch norm; elu activation
    b = tf.keras.layers.Conv2D(512, (3,3), strides=(2,2), padding='same', kernel_initializer=init)(e7)
    b = tf.keras.activations.elu(b)
    # decoder model: CD512-CD1024-CD1024-C1024-C1024-C512-C256-C128
    d1 = decoder_block(b, e7, 512)
    d2 = decoder_block(d1, e6, 512)
    d3 = decoder_block(d2, e5, 512)
    d4 = decoder_block(d3, e4, 512, dropout=False)
    d5 = decoder_block(d4, e3, 256, dropout=False)
    d6 = decoder_block(d5, e2, 128, dropout=False)
    d7 = decoder_block(d6, e1, 64, dropout=False)
    # output
    g = tf.keras.layers.Conv2DTranspose(3, (3,3), strides=(2,2), padding='same', kernel_initializer=init)(d7)
    out_image = tf.keras.activations.tanh(g)
    # define model
    model = tf.keras.models.Model(in_image, out_image)
    return model
# Values from original notebook
# shape = (11612,256,256,3) # this caused my notebook to OOM since it's huge
shape = (256,256,256,3)
batch_size = 8
epochs = 64
# Create fake random dataset
X_train = np.random.rand(*shape)
Y_train = np.random.rand(*shape)
dataset = (tf.data.Dataset.from_tensor_slices((X_train, Y_train))
           .repeat(epochs)
           .batch(batch_size, drop_remainder=True)
           .prefetch(16))
with strategy.scope():
    model = define_generator()
    adam = tf.keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5, beta_2=0.999)
    model.compile(optimizer=adam, loss='mean_absolute_error', metrics=['accuracy'])
    model.summary()
    model.fit(dataset)
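One small note on the fit call: because the dataset above uses .repeat(epochs), the single model.fit(dataset) call consumes all 64 passes over the data as one long Keras epoch. If you want per-epoch logging and callbacks (like the term callback from your original code), an alternative, sketched below under the same assumptions, is to drop the .repeat() and let Keras drive the epochs:

# Alternative sketch: unrepeated dataset, with epochs handled by model.fit so
# that per-epoch callbacks and metrics behave as usual.
dataset = (tf.data.Dataset.from_tensor_slices((X_train, Y_train))
           .batch(batch_size, drop_remainder=True)
           .prefetch(16))

with strategy.scope():
    model = define_generator()
    adam = tf.keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5, beta_2=0.999)
    model.compile(optimizer=adam, loss='mean_absolute_error', metrics=['accuracy'])
    model.fit(dataset, epochs=epochs)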