Keras model cannot be parallelized across multiple GPUs

Asked: 2017-06-15 04:16:06

Tags: python keras multi-gpu

I am trying to build a VAE to encode movie names and then train it on a machine with 8 GPUs. The model compiles and fits as expected on a single GPU, but it breaks when I try to run it across multiple GPUs. Here is the basic code for the autoencoder:

from keras.layers import Input, GRU, RepeatVector, Conv1D, Dense, TimeDistributed, Dropout, MaxPooling1D
from keras.models import Model
from keras.utils import to_categorical, plot_model
from keras.callbacks import ModelCheckpoint
import numpy as np
from keras import backend as K
from keras import metrics
from keras.layers import Lambda, Flatten, Layer
from keras import losses
import tensorflow as tf
import random

# Open file with 20k movie names from imdb
movies = open('/home/ubuntu/MovieNames/data/movies.dat')

data = []

# read data
for line in movies:
    data += [line.split("\t")]
names = [x[1] for x in data]

# get rid of the header
movie_names = names[1:]


chars = list('abcdefghijklmnopqrstuvwxyz ') + ['<END>', '<NULL>']
indices_for_chars = {c: i for i, c in enumerate(chars)}

NAME_MAX_LEN = 35 # include the <END> char

def name_to_vec(name, maxlen=NAME_MAX_LEN):
    name_lowercase = name.lower()
    v = np.zeros(maxlen, dtype=int)
    null_idx = indices_for_chars['<NULL>']
    v.fill(null_idx)
    # ignore cases
    for i, c in enumerate(name_lowercase):
        if i >= maxlen: break
        n = indices_for_chars.get(c, null_idx)
        v[i] = n
    v[min(len(name_lowercase), maxlen-1)] = indices_for_chars['<END>']
    return v

# convert to Keras-compatible form
names = np.array([to_categorical(name_to_vec(name),num_classes=len(chars)) for name in movie_names])

# Global parameters
NAME_LENGTH = names.shape[1]
ALPHABET = names.shape[2]
latent_dim = 10 * 8
intermediate_dim = 24 * 8
batch_size = 100 * 8
epochs = 20 
epsilon_std = 0.01

i = Input(shape=(NAME_LENGTH, ALPHABET))
x = Conv1D(256, 9)(i)
x = Dropout(0.2)(x)
x = Conv1D(256, 7)(x)
x = MaxPooling1D(pool_size=3)(x)
x = Dropout(0.2)(x)
x = Conv1D(256, 3)(x)
x = Dropout(0.2)(x)
x = Flatten()(x)
x = Dense(intermediate_dim, activation='relu')(x)
x = Dropout(0.2)(x)
z_mean = Dense(latent_dim)(x)
z_log_var = Dense(latent_dim)(x)

def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(batch_size, latent_dim),
                              mean=0., stddev=epsilon_std)
    return z_mean + K.exp(z_log_var) * epsilon

z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

h = Dense(intermediate_dim, activation='relu')(z)
h = RepeatVector(NAME_LENGTH)(h)
h = GRU(256, return_sequences=True)(h)
h = Dropout(0.2)(h)
h = GRU(256, return_sequences=True)(h)
h = TimeDistributed(Dense(ALPHABET, activation='softmax'), name='decoded_mean')(h)

autoencoder = Model(i, h)

def vae_objective(y_true, y_pred):
    recon = K.sum(K.categorical_crossentropy(y_pred,y_true),axis=1)
    kl = 0.5 * K.sum(K.exp(z_log_var) + K.square(z_mean) - 1. - z_log_var,axis=1)
    return recon + kl

I then use the Keras multi-GPU tool to parallelize the code:

from keras import backend as K
from keras.models import Model
from keras.layers import Input
from keras.layers.core import Lambda
from keras.layers.merge import Concatenate

def slice_batch(x, n_gpus, part):
    """
    Divide the input batch into [n_gpus] slices, and obtain slice no. [part].
    i.e. if len(x)=10, then slice_batch(x, 2, 1) will return x[5:].
    """
    sh = K.shape(x)
    L = sh[0] // n_gpus  # integer division so the slice indices stay integers
    if part == n_gpus - 1:
        return x[part*L:]
    return x[part*L:(part+1)*L]


def to_multi_gpu(model, n_gpus=2):
    """Given a keras [model], return an equivalent model which parallelizes
    the computation over [n_gpus] GPUs.

    Each GPU gets a slice of the input batch, applies the model on that slice
    and later the outputs of the models are concatenated to a single tensor, 
    hence the user sees a model that behaves the same as the original.
    """
    with tf.device('/cpu:0'):
        x = Input(model.input_shape[1:], name=model.input_names[0])

    towers = []
    for g in range(n_gpus):
        with tf.device('/gpu:' + str(g)):
            slice_g = Lambda(slice_batch, lambda shape: shape, arguments={'n_gpus':n_gpus, 'part':g})(x)
            towers.append(model(slice_g))

    with tf.device('/cpu:0'):
        merged = Concatenate(axis=0)(towers)

    return Model(inputs=[x], outputs=[merged])

When I fit it, that's when I run into problems:

model = to_multi_gpu(autoencoder, n_gpus=8)
model.compile(loss=vae_objective, optimizer='adam', metrics=["accuracy"])
model.fit(names[:8000], names[:8000], batch_size=batch_size)

which gives me the following error:

InvalidArgumentError: You must feed a value for placeholder tensor 'input_4' with dtype float
     [[Node: input_4 = Placeholder[dtype=DT_FLOAT, shape=[], _device="/job:localhost/replica:0/task:0/gpu:0"]()]]

Note that all the parameters are divisible by the number of GPUs, so I wouldn't expect that to be the problem.

1 answer:

Answer 0 (score: 0)

Use

model = to_multi_gpu(autoencoder, n_gpus=8)
model.compile(loss=vae_objective, optimizer='adam', metrics=["accuracy"])
model.fit(names[:8000], names[:8000], batch_size=batch_size*8)

i.e. define the VAE with batch_size, but run it with batch_size * gpus.

Make sure the number of samples is divisible by batch_size * gpus.
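
As a minimal sketch of the arithmetic this implies, assuming the per-GPU batch is the batch_size hard-coded into sampling() (the dataset-trimming lines are added here purely for illustration, to satisfy the divisibility requirement; they are not part of the answer's code):

n_gpus = 8
batch_size = 100 * 8                  # per-GPU batch, as hard-coded in sampling()
global_batch = batch_size * n_gpus    # batch size to pass to model.fit()

# each of the 8 towers then receives global_batch // n_gpus samples,
# which matches the shape expected by the sampling() Lambda
assert global_batch // n_gpus == batch_size

# illustrative: trim the training set so it splits evenly into global batches
n_samples = (len(names) // global_batch) * global_batch
model.fit(names[:n_samples], names[:n_samples], batch_size=global_batch)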