My architecture is a standard seq-to-seq encoder-decoder that uses Keras generators, so I don't have to load all of the TD into memory at once. If I only change the compile and fit calls to target self.model, this runs perfectly in the very same virtual environment that produces the error (i.e. with tensorflow-gpu installed); in other words, everything works fine on a single GPU. With multiple GPUs, however, some part of the data gets flattened or recombined incorrectly, leading to a shape mismatch. Here is the error: [screenshot of the error traceback]
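For reference, the working single-GPU path is just the same compile/fit_generator calls made directly on self.model instead of on the multi_gpu_model wrapper. The attribute names below (self.model, self.train_gen) come from my own wrapper class, so treat this as a sketch rather than exact code:

# single-GPU path that works: compile/fit the plain Keras model directly
self.model.compile(optimizer='rmsprop',
                   loss='sparse_categorical_crossentropy',
                   metrics=['sparse_categorical_accuracy'])
self.model.fit_generator(self.train_gen, epochs=1, verbose=1)

# multi-GPU path that fails: wrap the same model with multi_gpu_model first
parallel_model = multi_gpu_model(self.model, gpus=4)
parallel_model.compile(optimizer='rmsprop',
                       loss='sparse_categorical_crossentropy',
                       metrics=['sparse_categorical_accuracy'])
parallel_model.fit_generator(self.train_gen, epochs=1, verbose=1)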
---- EDIT ---- I was able to reproduce this error fairly easily in a single file, using a recent version of keras + tensorflow-gpu + CUDA 9.2, with the following code:
import tensorflow as tf
import numpy as np
from keras.utils import multi_gpu_model, Sequence
from keras.layers import Input, LSTM, Dense, TimeDistributed, Embedding
from keras.models import Model

BATCH_SIZE = 4

class trivial_Sequence(Sequence):
    def __init__(self, x_set, y_set, batch_size):
        self.x = np.zeros((batch_size*4, 64))
        self.y = np.zeros((batch_size*4, 64, 1))
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.x)/float(self.batch_size)))

    def __getitem__(self, idx):
        batch_x = self.x[idx*self.batch_size:(idx+1)*self.batch_size]
        batch_y = self.y[idx*self.batch_size:(idx+1)*self.batch_size]
        return batch_x, batch_y

def error_train():
    # instantiate components
    td = trivial_Sequence(None, None, BATCH_SIZE)
    input = Input(shape=(None,), dtype='int32')
    emb = Embedding(output_dim=10, input_dim=64, input_length=None)
    encode = LSTM(10, return_sequences=True, return_state=True)
    project_up = Dense(units=20, activation='softmax')

    # build network
    temp = emb(input)
    temp, _, _ = encode(temp)
    output = TimeDistributed(project_up)(temp)
    model = Model(inputs=input, outputs=output)

    parallel_model = multi_gpu_model(model, gpus=4)
    parallel_model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy',
                           metrics=['sparse_categorical_accuracy'])
    parallel_model.fit_generator(td, epochs=1, verbose=1)

# run it
error_train()