I am writing a visual question answering (VQA) neural network.
The model:
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model, Sequential
# img shape
img_h = 320
img_w = 480
# ----------------------
EMBEDDING_SIZE = 128
# Define CNN for Image Input
vision_model = Sequential()
vision_model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(img_h, img_w, 3)))  # possible dimension error here
vision_model.add(MaxPooling2D((2, 2)))
vision_model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
vision_model.add(MaxPooling2D((2, 2)))
vision_model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
vision_model.add(MaxPooling2D((2, 2)))
vision_model.add(Flatten())
image_input = Input(shape=(img_h, img_w, 3))
encoded_image = vision_model(image_input)
# Define RNN for language input
question_input = Input(shape=[41], dtype='int32')
embedded_question = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=EMBEDDING_SIZE, input_length=41)(question_input)
encoded_question = LSTM(128)(embedded_question)
# Combine CNN and RNN to create the final model
merged = tf.keras.layers.concatenate([encoded_question, encoded_image])
output = Dense(13, activation='softmax')(merged)
vqa_model = Model(inputs=[image_input, question_input], outputs=output)
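For reference, a quick sanity check of the input shapes the model ends up expecting (just a sketch, assuming the model above builds without errors):
vqa_model.summary()
print(vqa_model.input_shape)   # expected: [(None, 320, 480, 3), (None, 41)]
print(vqa_model.output_shape)  # expected: (None, 13)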
The data generator:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
import os
import cv2
img_directory = os.path.join(dataset_dir, "train")
thisdict = {
    '0': 0,
    '1': 1,
    '10': 2,
    '2': 3,
    '3': 4,
    '4': 5,
    '5': 6,
    '6': 7,
    '7': 8,
    '8': 9,
    '9': 10,
    'no': 11,
    'yes': 12
}
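# (Hypothetical helper, not part of my original code: the inverse of thisdict can be
#  used to turn the argmax of the 13-way softmax back into an answer label.)
inverse_dict = {value: key for key, value in thisdict.items()}
# e.g. inverse_dict[int(np.argmax(out_softmax[0]))] -> '0'...'9', '10', 'no' or 'yes'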
def get_image(question):
    # Load the image referenced by this question entry
    image_name = question.get("image_filename")
    img = cv2.imread(os.path.join(img_directory, image_name))
    return img

def get_question(question):
    # Extract the raw question text
    return question.get("question")

def question_to_token(question):
    # Convert a single question string into a list of integer tokens
    question_token_list = tokenizer.texts_to_sequences([question])
    return question_token_list[0]

def padding(question_token):
    # Pad a single token sequence to the maximum question length
    question_padded_list = pad_sequences([question_token], maxlen=max_question_length, padding='post')
    return question_padded_list[0]

def preprocess_question(question):
    # Tokenize and pad a question string
    question_tokenized = question_to_token(question)
    question_padded = padding(question_tokenized)
    return question_padded

def get_output(question):
    # Map the answer string to its integer class index
    question_answer = question.get("answer")
    return thisdict[question_answer]

def preprocess_input(image):
    """--- Rescale Image
    --- Rotate Image
    --- Resize Image
    --- Flip Image
    --- PCA etc. """
    return image
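# A hypothetical version of preprocess_input (sketch only, not the function used above):
# rescale pixel values to [0, 1] and resize to the (img_h, img_w) the CNN expects.
def preprocess_input_sketch(image):
    image = cv2.resize(image, (img_w, img_h))   # cv2.resize takes (width, height)
    return image.astype(np.float32) / 255.0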
def image_generator(batch_size=64):
    while True:
        # Select a random batch of question entries
        batch_questions = np.random.choice(a=train_data, size=batch_size)
        batch_input_image = []
        batch_input_question = []
        batch_output = []
        # Read each input, perform preprocessing and get the label
        for input_element in batch_questions:
            input_question = get_question(input_element)
            input_question_preprocessed = preprocess_question(input_question)
            input_image = get_image(input_element)
            output = get_output(input_element)
            batch_input_image.append(input_image)
            batch_input_question.append(input_question_preprocessed)
            batch_output.append(output)
        # Yield a tuple of ([image batch, question batch], labels) to feed the network
        batch_x_image = np.array(batch_input_image)
        batch_x_question = np.array(batch_input_question)
        batch_y = np.array(batch_output)
        yield [batch_x_image, batch_x_question], batch_y
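For completeness, a generator like this gets plugged into training roughly as follows (the optimizer, loss and step count are assumptions rather than my exact settings; the sparse loss matches the integer class labels from thisdict):
train_generator = image_generator(batch_size=64)
vqa_model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
vqa_model.fit(train_generator,
              steps_per_epoch=len(train_data) // 64,
              epochs=10)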
Basically, I generate [image, question] "pairs" and map each pair to the correct answer. I tokenize the questions with the Tokenizer (so I get a list of integers, where each integer maps to a word in the question), and after padding the questions I end up with a fixed input length of 41 (41 is the length of the longest question).
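The tokenizer and max_question_length used above come from the training questions, roughly like this (a sketch; train_questions is just a stand-in name for the list of all training question strings):
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_questions)
train_sequences = tokenizer.texts_to_sequences(train_questions)
max_question_length = max(len(seq) for seq in train_sequences)  # 41 in my case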
At test time I am using:
out_softmax = vqa_model.predict(x=(image,question_string))
But I keep getting the error mentioned in the question title. I checked question_string.shape and got (41,), which seems to match the input size my network expects. I also printed the contents of question_string:
[ 3 4 25 16 39 7 66 2 1 5 47 6 1 17 36 51 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
which (again) is in line with what I expect.
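Concretely, the check is just the following; the vqa_model.input_shape line is added here only for reference, to show what the model itself declares as its inputs:
print(question_string.shape)   # prints (41,)
print(question_string)         # prints the padded token array shown above
print(vqa_model.input_shape)   # prints [(None, 320, 480, 3), (None, 41)]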
I also looked at this earlier question: Error when checking input: expected dense_input to have shape (21,) but got array with shape (1,). I more or less understand what the problem is, but I couldn't find a solution (adding extra brackets did not work for me).
How can I solve this problem? And why do I keep getting the error even though, as shown above, my array's shape really is the (41,) that predict asks for?