Saving and loading Keras subclassed models

Date: 2019-10-30 16:17:19

Tags: python tensorflow keras

I am trying to save and load the CNN encoder and RNN decoder from the TF image-captioning tutorial (https://www.tensorflow.org/tutorials/text/image_captioning). Since these are subclasses of the Keras Model, rather than functional or Sequential models, I cannot use model.save and model.load directly.

Instead, I have to use model.save_weights and model.load_weights. The problem is that model.load_weights can only be called after model.build, and model.build requires the input_shape argument to be a single tuple rather than a list of tuples. Our RNN decoder, however, takes multiple inputs, and the Keras documentation states that model.build cannot be called with multiple input shapes.
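To make the constraint concrete, here is a minimal sketch of the situation; the shapes, the 2048 feature depth, and the weight-file prefixes are my assumptions based on the tutorial, not code from the original post:

encoder = CNN_Encoder(embedding_dim)
# Single input: build() accepts one shape tuple, so load_weights works afterwards
encoder.build(input_shape=(None, 64, 2048))
encoder.load_weights("encoder_weights")

decoder = RNN_Decoder(embedding_dim, units, vocab_size)
# Three inputs (x, features, hidden): there is no list-of-shapes equivalent,
# so the call below is not supported and load_weights cannot be reached
# decoder.build(input_shape=[(None, 1), (None, 64, embedding_dim), (None, units)])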

Is there another way to load the model?

Ultimately, I want a small Python script that loads the model weights and runs inference. The script does not need to do any training.
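For what it's worth, the tutorial itself tracks these subclassed models with tf.train.Checkpoint during training, and the same mechanism can restore them in a standalone inference script. A minimal sketch, assuming the tutorial's checkpoint directory ./checkpoints/train and hyperparameters:

import tensorflow as tf

encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

# Restore whatever the training script saved; expect_partial() silences
# warnings about the optimizer slots we deliberately leave unrestored.
ckpt = tf.train.Checkpoint(encoder=encoder, decoder=decoder)
ckpt.restore(tf.train.latest_checkpoint("./checkpoints/train")).expect_partial()
# Variables are matched lazily, on the first call to each model.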

Colab: https://colab.research.google.com/drive/12YtCH2X0pwIBBXPW0TXmeA520MyVv9AF

1 Answer:

Answer 0 (score: 1)

Here is how I managed to work around this. Not an elegant solution, but it works! First, save every weight matrix to a .npy file:

import os
import numpy as np

# The target directories must exist before np.save is called
os.makedirs("encoder_layer_weights", exist_ok=True)
os.makedirs("decoder_layer_weights", exist_ok=True)

# Dump every weight tensor of every encoder layer to its own .npy file
for i, layer in enumerate(encoder.layers):
    print("Layer %s" % i, layer.name)
    for j, w in enumerate(layer.weights):
        print(w.shape)
        np.save("encoder_layer_weights/layer_%s_%s_weights_%s.npy" % (i, layer.name, j), w.numpy())

# ...and the same for the decoder
for i, layer in enumerate(decoder.layers):
    print("Layer %s" % i, layer.name)
    for j, w in enumerate(layer.weights):
        print(w.shape)
        np.save("decoder_layer_weights/layer_%s_%s_weights_%s.npy" % (i, layer.name, j), w.numpy())

Then you recreate the subclassed models, but this time you pass initializers for every weight of every layer. This must be done carefully: if any shape does not match, the model will fail to build.
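The trick relies on tf.keras.initializers.Constant accepting a full array whose shape matches the variable being created. A tiny round-trip on a single Dense layer (my own illustration, not part of the original answer):

import numpy as np
import tensorflow as tf

# Grab the weights of a freshly built Dense layer...
src = tf.keras.layers.Dense(4)
src.build((None, 3))
kernel, bias = [w.numpy() for w in src.weights]

# ...and bake them into a new layer via Constant initializers.
C = tf.keras.initializers.Constant
dst = tf.keras.layers.Dense(4, kernel_initializer=C(kernel), bias_initializer=C(bias))
dst.build((None, 3))  # shapes here must match the arrays above
assert np.allclose(dst.weights[0].numpy(), kernel)
assert np.allclose(dst.weights[1].numpy(), bias)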

class CNN_Encoder(tf.keras.Model):
    # Since you have already extracted the features and dumped it using pickle
    # This encoder passes those features through a Fully connected layer
    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        # shape after fc == (batch_size, 64, embedding_dim)
        C = tf.keras.initializers.Constant
        w1, w2 = [np.load("encoder_layer_weights/layer_%s_%s_weights_%s.npy" %(0, "dense", j)) \
                                      for j in range(2)]
        self.fc = tf.keras.layers.Dense(embedding_dim, kernel_initializer=C(w1), bias_initializer=C(w2))

    def call(self, x):
        x = self.fc(x)
        x = tf.nn.relu(x)
        return x


class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        C = tf.keras.initializers.Constant
        w1, w2, w3, w4, w5, w6 = [np.load("decoder_layer_weights/layer_%s_%s_weights_%s.npy" %(4, "bahdanau_attention", j)) \
                                  for j in range(6)]
        self.W1 = tf.keras.layers.Dense(units, kernel_initializer=C(w1), bias_initializer=C(w2))
        self.W2 = tf.keras.layers.Dense(units, kernel_initializer=C(w3), bias_initializer=C(w4))
        self.V = tf.keras.layers.Dense(1, kernel_initializer=C(w5), bias_initializer=C(w6))

    def call(self, features, hidden):
        # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)

        # hidden shape == (batch_size, hidden_size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden_size)
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        # score shape == (batch_size, 64, hidden_size)
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))

        # attention_weights shape == (batch_size, 64, 1)
        # you get 1 at the last axis because you are applying score to self.V
        attention_weights = tf.nn.softmax(self.V(score), axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights


class RNN_Decoder(tf.keras.Model):
    def __init__(self, embedding_dim, units, vocab_size):
        super(RNN_Decoder, self).__init__()
        self.units = units

        C = tf.keras.initializers.Constant
        w_emb = np.load("decoder_layer_weights/layer_%s_%s_weights_%s.npy" %(0, "embedding", 0))
        w_gru_1, w_gru_2, w_gru_3 = [np.load("decoder_layer_weights/layer_%s_%s_weights_%s.npy" %(1, "gru", j)) for j in range(3)]
        w1, w2 = [np.load("decoder_layer_weights/layer_%s_%s_weights_%s.npy" %(2, "dense_1", j)) for j in range(2)]
        w3, w4 = [np.load("decoder_layer_weights/layer_%s_%s_weights_%s.npy" %(3, "dense_2", j)) for j in range(2)]

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, embeddings_initializer=C(w_emb))
        self.gru = tf.keras.layers.GRU(self.units,
                                       return_sequences=True,
                                       return_state=True,
                                       kernel_initializer=C(w_gru_1),
                                       recurrent_initializer=C(w_gru_2),
                                       bias_initializer=C(w_gru_3)
                                       )
        self.fc1 = tf.keras.layers.Dense(self.units, kernel_initializer=C(w1), bias_initializer=C(w2))
        self.fc2 = tf.keras.layers.Dense(vocab_size, kernel_initializer=C(w3), bias_initializer=C(w4))

        self.attention = BahdanauAttention(self.units)

    def call(self, x, features, hidden):
        # defining attention as a separate model
        context_vector, attention_weights = self.attention(features, hidden)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # shape == (batch_size, max_length, hidden_size)
        x = self.fc1(output)

        # x shape == (batch_size * max_length, hidden_size)
        x = tf.reshape(x, (-1, x.shape[2]))

        # output shape == (batch_size * max_length, vocab)
        x = self.fc2(x)

        return x, state, attention_weights

    def reset_state(self, batch_size):
        return tf.zeros((batch_size, self.units))

Finally, you instantiate the Encoder and Decoder classes as usual:

encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)
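Because subclassed models only create their variables on the first call, you can force a build with dummy inputs and verify that the saved weights were baked in. A short check of my own (the 2048 feature depth is an assumption taken from the tutorial):

# Force variable creation with dummy inputs of the tutorial's shapes
features = encoder(tf.zeros((1, 64, 2048)))
hidden = decoder.reset_state(batch_size=1)
dec_input = tf.expand_dims([0], 0)  # a single dummy token index
predictions, hidden, _ = decoder(dec_input, features, hidden)

# The encoder's Dense kernel should equal the array we saved earlier
w_saved = np.load("encoder_layer_weights/layer_0_dense_weights_0.npy")
assert np.allclose(encoder.fc.kernel.numpy(), w_saved)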