Question

我正在尝试实现图像字幕模型。它以图像和文本作为输入。

我的模型：

# Width shared by the image projection, the LSTM state and the zero initial states.
unit_size = 256

# Each layer object is created once; calling the same object again reuses its weights.
EncoderDense = Dense(unit_size, use_bias=False, name='dense_img')
EmbeddingLayer = Embedding(vocab_size2, embedding_dim, mask_zero=True, name='emb_text')
LSTMLayer = LSTM(unit_size, return_state=True, name='lstm')
SoftmaxLayer = Dense(vocab_size2, activation='softmax', name='time_distributed_softmax')
BatchNormLayer = BatchNormalization(name='batch_normalization_img')

# Image embedding: project the image features to unit_size, batch-normalise them,
# then add a length-1 time axis so the LSTM can consume the image as a single step.
# (The former Reshape of X_img1 was overwritten before use and has been removed.)
inputs1 = Input(shape=(OUTPUT_DIM,))
X_img1 = EncoderDense(inputs1)
X_img2 = BatchNormLayer(X_img1)
X_img = Lambda(lambda x: K.expand_dims(x, axis=1))(X_img2)

# Text embedding
inputs2 = Input(shape=(max_length,))
X_text = EmbeddingLayer(inputs2)

# Initial States
a0 = Input(shape=(unit_size,))
c0 = Input(shape=(unit_size,))

# Run the image through the LSTM once; the resulting states seed the decoder.
a, _, c = LSTMLayer(X_img, initial_state=[a0, c0])

x = X_text

# One decoder LSTM shared by every unrolled step. Creating a fresh LSTM inside
# the loop gave each step its own independent weights.
# NOTE(review): reusing LSTMLayer here instead would only work if
# embedding_dim == unit_size, which is not visible in this snippet — confirm.
DecoderLSTM = LSTM(unit_size, return_state=True, name='lstm_decoder')

outputs = []
for i in range(max_length):
    a, _, c = DecoderLSTM(x, initial_state=[a, c])
    output = SoftmaxLayer(a)
    outputs.append(output)
    # Greedy step: take the arg-max token id and embed it as the next input.
    x = Lambda(lambda x: K.expand_dims(K.argmax(x)))(output)
    x = EmbeddingLayer(x)

# NOTE(review): `outputs` is a list of max_length tensors, while batch_generator
# in this file yields a single stacked target array — confirm the targets match
# this output structure before training.
NICModel = Model(inputs=[inputs1, inputs2, a0, c0], outputs=outputs, name='NIC_greedy_inference_v2')

我的生成器是：

def batch_generator(batch_size, max_len, photos, descriptions, unit_size=256):
    """Yield training batches for the captioning model, looping forever.

    A batch is emitted after every ``batch_size`` photos; because each photo
    contributes one sample per description, the number of samples in a batch
    is the total number of descriptions of those photos.

    Args:
        batch_size: Number of photos (not samples) per batch; must be >= 1.
        max_len: Length every input/target token sequence is padded or
            truncated to.
        photos: Mapping from photo key to its feature vector.
        descriptions: Mapping from photo key to a list of caption strings.
        unit_size: Width of the all-zero initial LSTM states in each batch.

    Yields:
        ``([img, X_text, a0, c0], Y)`` where ``img`` stacks the photo
        features, ``X_text`` holds the token ids without the last token,
        ``a0``/``c0`` are zero arrays of shape ``(samples, unit_size)`` and
        ``Y`` is the one-hot encoding of the token ids shifted by one.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (the generator would
            otherwise loop forever without yielding).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    photo_count = 0
    img_features = []
    X_text = []
    Y_text = []
    while True:
        for key, desc_list in descriptions.items():
            photo_count += 1
            photo = photos[key]
            # Each photo has several descriptions; each one becomes a sample.
            for desc in desc_list:
                # Map known words to their indices; unknown words are dropped.
                seq = [wordtoidx2[word] for word in desc.split(' ') if word in wordtoidx2]  # wordtoidx2 or wordtoidx
                # Pad with zeros, then cut to exactly max_len so every row has
                # the same length even for empty or over-long captions.
                in_seq = (seq[:-1] + [0] * max_len)[:max_len]
                out_seq = (seq[1:] + [0] * max_len)[:max_len]
                img_features.append(photo)
                X_text.append(in_seq)
                Y_text.append(out_seq)
            if photo_count == batch_size:
                img = np.array(img_features)
                X_text_mat = np.array(X_text)
                Y_text_mat = to_categorical(Y_text, vocab_size2)
                n_samples = X_text_mat.shape[0]
                yield (
                    [img, X_text_mat,
                     np.zeros([n_samples, unit_size]),
                     np.zeros([n_samples, unit_size])],
                    Y_text_mat)
                # Start accumulating the next batch from scratch.
                photo_count = 0
                img_features = []
                X_text = []
                Y_text = []

当我尝试拟合模型时：

# Build the endless batch generator and train for one epoch of `steps` batches.
NICGenerator = batch_generator(number_pics_per_batch, max_length, encoding_train, train_descriptions)
# NOTE: fit_generator is deprecated in recent TensorFlow releases, where
# Model.fit accepts generators directly; kept here for compatibility.
NICModel.fit_generator(NICGenerator, epochs=1, steps_per_epoch=steps, verbose=2, initial_epoch=0)

我遇到错误：

WARNING:tensorflow:Model was constructed with shape (None, 2048) for input Tensor("input_70:0", shape=(None, 2048), dtype=float32), but it was called on an input with incompatible shape (2, 24).
ValueError: Input 0 of layer dense_img is incompatible with the layer: expected axis -1 of input shape to have value 2048 but received input with shape [2, 24]

生成器和模型大体上来自 https://github.com/soloist97/Show-And-Tell-Keras

我不确定这段代码本身是否正确。最后需要说明的是，我的模型与链接中的模型几乎相同，但当我绘制模型结构图时，得到的图却与原模型不同。

ValueError：层 dense_img 的输入 0 与该层不兼容

0 个答案: