I am running my bidirectional GRU encoder-decoder on a Tesla V100 GPU. My training set has 800 samples, and the sequences are variable length: anywhere from 771 to 25,672 tokens. When I run it, memory usage climbs to the maximum and the job crashes with an OOM error. I then tried running a small subset (10 samples) through the same model; it ran successfully, but it still maxed out the memory. I have also run the same code with 10 samples on a Tesla K80 12GB (Google Colaboratory), where it only consumed about 3GB. I already tried allow_growth, but it does not fix the error.
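For reference, a minimal TF 1.x sketch of the usual way allow_growth is enabled (assuming the graph and init defined in the model code below); note that allow_growth only defers allocation, it does not lower the peak memory the graph actually needs:

    import tensorflow as tf

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # grow GPU memory on demand instead of pre-allocating all of it

    with tf.Session(graph=graph, config=config) as sess:
        sess.run(init)
        # ... training loop feeding x, y_label and initial_input ...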
Here are the details of my software and hardware:
These are my model's hyperparameters:
This is my model code:
import tensorflow as tf
from tensorflow.keras.layers import Add

# hyperparameters (embedding_size, hidden_unit, alignment_unit, max_summary_len,
# seed, mean, stddev, learning_rate) are defined above

graph = tf.Graph()
with graph.as_default():
    # placeholders: one variable-length input sequence and its fixed-length target summary
    x = tf.placeholder(tf.float32, shape=[1, None, embedding_size])
    y_label = tf.placeholder(tf.float32, shape=[1, max_summary_len, embedding_size])
    initial_input = tf.placeholder(tf.float32, shape=(embedding_size, 1))

    # attention parameters
    Wa = tf.Variable(tf.random_normal([hidden_unit, hidden_unit], seed=seed, mean=mean, stddev=0.001))
    v_a = tf.transpose(tf.zeros([alignment_unit, 1], dtype=tf.float32))
    Ua = tf.Variable(tf.random_normal([alignment_unit, 2*hidden_unit], seed=seed, mean=mean, stddev=0.001))

    # decoder GRU parameters (update gate u, reset gate r, candidate state)
    Wu = tf.Variable(tf.random_normal([hidden_unit, embedding_size], seed=seed, mean=mean, stddev=stddev))
    Uu = tf.Variable(tf.random_normal([hidden_unit, hidden_unit], seed=seed, mean=mean, stddev=stddev))
    Cu = tf.Variable(tf.random_normal([hidden_unit, 2*hidden_unit], seed=seed, mean=mean, stddev=stddev))
    Wr = tf.Variable(tf.random_normal([hidden_unit, embedding_size], seed=seed, mean=mean, stddev=stddev))
    Ur = tf.Variable(tf.random_normal([hidden_unit, hidden_unit], seed=seed, mean=mean, stddev=stddev))
    Cr = tf.Variable(tf.random_normal([hidden_unit, 2*hidden_unit], seed=seed, mean=mean, stddev=stddev))
    W = tf.Variable(tf.random_normal([hidden_unit, embedding_size], seed=seed, mean=mean, stddev=stddev))
    U = tf.Variable(tf.random_normal([hidden_unit, hidden_unit], seed=seed, mean=mean, stddev=stddev))
    C = tf.Variable(tf.random_normal([hidden_unit, 2*hidden_unit], seed=seed, mean=mean, stddev=stddev))

    # output projection parameters
    Ww_o = tf.Variable(tf.random_normal([embedding_size, embedding_size], seed=seed, mean=mean, stddev=stddev))
    Wc_o = tf.Variable(tf.random_normal([embedding_size, 2*hidden_unit], seed=seed, mean=mean, stddev=stddev))
    Ws_o = tf.Variable(tf.random_normal([embedding_size, hidden_unit], seed=seed, mean=mean, stddev=stddev))
    Wo = tf.Variable(tf.random_normal([embedding_size, 1], seed=seed, mean=mean, stddev=stddev))
    b_o = tf.zeros([embedding_size, 1])

    # define model
    """__encoder___"""
    encoder_LSTM = tf.keras.layers.CuDNNGRU(hidden_unit, return_sequences=True, return_state=True)
    encoder_LSTM_rev = tf.keras.layers.CuDNNGRU(hidden_unit, return_state=True, return_sequences=True, go_backwards=True)
    encoder_outputs, state_h = encoder_LSTM(x)
    encoder_outputsR, state_hR = encoder_LSTM_rev(x)
    state_hfinal = Add()([state_h, state_hR])
    encoder_outputs_final = tf.concat([encoder_outputs, encoder_outputsR], axis=2)

    """__decoder___"""
    initial_state = tf.zeros([hidden_unit, 1], dtype=tf.float32)
    arr = tf.reshape(initial_input, (1, embedding_size))

    def decoderStep(arr, last_output, last_state, step):
        # previous decoder state repeated once per encoder timestep
        prev_state_rep = tf.tile(last_state, (1, tf.shape(encoder_outputs_final)[1]))  # something wrong
        # additive attention scores over all encoder positions
        e = tf.matmul(v_a, tf.tanh(tf.add(tf.matmul(Wa, prev_state_rep),
                                          tf.matmul(Ua, tf.reshape(encoder_outputs_final, [2*hidden_unit, -1])))))
        pembilang = tf.math.exp(e)                          # softmax numerator
        penyebut = tf.reduce_sum(pembilang, axis=1)         # softmax denominator
        penyebut = tf.reshape(penyebut, [1, 1])
        penyebut = tf.tile(penyebut, (1, tf.shape(e)[1]))   # denominator repeated to the input length
        alphas = pembilang / penyebut
        # context vector: attention-weighted sum of the encoder outputs
        c = tf.reduce_sum(alphas * tf.reshape(encoder_outputs_final, [2*hidden_unit, -1]), axis=1)
        c = tf.expand_dims(c, 1)
        # GRU-style update/reset gates and new decoder state
        u = tf.nn.sigmoid(tf.matmul(Wu, last_output) + tf.matmul(Uu, last_state) + tf.matmul(Cu, c))
        r = tf.nn.sigmoid(tf.matmul(Wr, last_output) + tf.matmul(Ur, last_state) + tf.matmul(Cr, c))
        s_ = tf.nn.tanh(tf.matmul(W, last_output) + tf.multiply(tf.matmul(U, r), tf.matmul(U, last_state)) + tf.matmul(C, c))
        s = tf.multiply(1-u, last_state) + tf.multiply(u, s_)
        # output projection for this decoding step
        o = tf.matmul(Ww_o, last_output) + tf.matmul(Wc_o, c) + tf.matmul(Ws_o, s)
        y_pred = tf.multiply(Wo, o) + b_o
        arr = tf.concat([arr, tf.reshape(y_pred, (1, embedding_size))], 0)
        return [arr, y_pred, s, tf.add(step, 1)]

    i = tf.constant(0)
    each_step = lambda c, a, b, step: tf.less(step, max_summary_len-1)
    predss, ltm_pred, lstm_state, _ = tf.while_loop(
        each_step, decoderStep, [arr, initial_input, initial_state, i],
        shape_invariants=[tf.TensorShape([None, embedding_size]),
                          initial_input.get_shape(),
                          initial_state.get_shape(),
                          i.get_shape()])
    preds = tf.expand_dims(predss, 0)

    loss = tf.losses.mean_squared_error(y_label, preds)
    # loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(tf.math.abs(preds)), axis=[0]))
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
    saver = tf.train.Saver()
    init = tf.global_variables_initializer()
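For reference, the hand-written decoder above appears to implement Bahdanau-style additive attention over the concatenated bidirectional encoder states h_j; a sketch of the intended math (my reading, not taken verbatim from the post):

    e_{ij} = v_a^\top \tanh(W_a s_{i-1} + U_a h_j)
    \alpha_{ij} = \frac{\exp(e_{ij})}{\sum_k \exp(e_{ik})}
    c_i = \sum_j \alpha_{ij} h_j

followed by a GRU-style cell with update gate u, reset gate r, and new state s. Because e, alphas, and the tiled previous state each have one column per input token, these intermediates (and the activations kept for backpropagation through tf.while_loop) grow linearly with the input length, which is consistent with running out of memory on sequences up to 25,672 tokens.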
This is the error message:
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[1024,8370] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[node while/mul (defined at ta_skenario1.py:226) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
[[node mean_squared_error/value (defined at ta_skenario1.py:246) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
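Following the hint in the error message, a minimal sketch of passing report_tensor_allocations_upon_oom through RunOptions so the OOM report lists the live tensors (batch_x, batch_y and start_vec are hypothetical feed values, not names from the question):

    run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)
    sess.run([optimizer, loss],
             feed_dict={x: batch_x, y_label: batch_y, initial_input: start_vec},
             options=run_options)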
This is the output of nvidia-smi:
Any help? Thanks.
UPDATE: I just realized that my code always hits OOM in the hand-written TensorFlow (decoder) part. It never OOMs in the Keras CuDNNGRU. Any suggestions for changing my code to something simpler? Thanks.
UPDATE: I changed the embedding size to 64, the hidden units to 128, and the alignment units to 64. The OOM error is gone, but it takes a very long time (about 13 minutes) for 1 epoch.
Answer 0 (score: 0)
Can you reduce the size of the hidden layers? Adding dropout layers might also help.
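A minimal sketch of those two suggestions applied to the encoder above (the 256 units and the 0.2 rate are illustrative values, not from the question):

    # smaller recurrent layer than the original hidden_unit
    encoder_GRU = tf.keras.layers.CuDNNGRU(256, return_sequences=True, return_state=True)
    encoder_outputs, state_h = encoder_GRU(x)
    # dropout on the encoder outputs; training=True keeps it active in this graph-mode setup,
    # so it would need to be disabled (or fed a flag) at inference time
    encoder_outputs = tf.keras.layers.Dropout(0.2)(encoder_outputs, training=True)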