我正在尝试使用Tensorflow实现多GPU LSTM,该Tensorflow基于存在的cifar10代码here。
这是完整的代码: 我首先定义了LSTM CELL:
def lstm_cell(X, reuse):
# Define a scope for reusing the variables
with tf.variable_scope('lstm', reuse=reuse):
weights={
'in':tf.Variable(tf.random_normal([num_input_features,hidden_units])),
'out':tf.Variable(tf.random_normal([hidden_units,1]))
}
biases={
'in':tf.Variable(tf.constant(0.1,shape=[hidden_units,])),
'out':tf.Variable(tf.constant(0.1,shape=[1]))
}
....
# the rest of the code to create a lstm cell and pred variable
return pred
计算一个塔中损失的函数:
def total_loss(pred_train,Y,reuse):
loss_op=tf.reduce_mean(tf.square(tf.reshape(pred_train,[-1])-tf.reshape(Y, [-1])))
tf.add_to_collection('losses', loss_op)
_total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
return _total_loss
在所有塔中造成全球损失的功能
def tower_loss(scope,X,Y,reuse):
pred_train =lstm_cell(X,reuse)
_= total_loss(pred_train,Y,reuse)
_tower_loss_list = tf.get_collection('losses', scope)
_tower_loss = tf.add_n(_tower_loss_list, name='tower_loss')
return _tower_loss
平均渐变
的函数def average_gradients(tower_grads):
average_grads = []
for grad_and_vars in zip(*tower_grads):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
grads = []
for g, _ in grad_and_vars:
# Add 0 dimension to the gradients to represent the tower.
expanded_g = tf.expand_dims(g, 0)
# Append on a 'tower' dimension which we will average over below.
grads.append(expanded_g)
# Average over the 'tower' dimension.
grad = tf.concat(grads, 0)
grad = tf.reduce_mean(grad, 0)
# Keep in mind that the Variables are redundant because they are shared
# across towers. So .. we will just return the first tower's pointer to
# the Variable.
v = grad_and_vars[0][1]
grad_and_var = (grad, v)
average_grads.append(grad_and_var)
return average_grads
最后是培训:
# Place all ops on CPU by default
def training():
with tf.Graph().as_default(),tf.device('/cpu:0'):
tower_grads = []
reuse_vars = False
# tf Graph input
X=tf.placeholder(tf.float32, shape=[None,time_step,num_input_features],name='X')
Y=tf.placeholder(tf.float32, shape=[None,time_step,output_size],name='Y')
tf.train.GradientDescentOptimizer(learning_rate=decay_learning_rate)
# Loop over all GPUs and construct their own computation graph
for i in range(num_gpus):
with tf.device('/gpu:%d' % i):
with tf.name_scope('Tower_%d'%i) as scope:
# Split data between GPUs
_x = X[i * batch_size: (i+1) * batch_size]
_y = Y[i * batch_size: (i+1) * batch_size]
loss_op = tower_loss(scope,_x,_y,reuse=reuse_vars)
# Reuse variables for the next tower.
tf.get_variable_scope().reuse_variables()
grads = optimizer.compute_gradients(loss_op)
reuse_vars = True
tower_grads.append(grads)
tower_grads_avg = average_gradients(tower_grads)
apply_gradient_op = optimizer.apply_gradients(tower_grads_avg,global_step=global_step)
train_op = apply_gradient_op
init = tf.global_variables_initializer()
# Start Training
with tf.Session() as sess:
print ('----Training the LSTM With Multi GPU------')
# Run the initializer
sess.run(init)
ts = time.time()
for i in range(max_iters):
for epoch in range(len(epoch_start_index)//num_gpus):
_,loss_=sess.run([train_op,loss_op,],feed_dict = \
{X:train_x[epoch_start_index[epoch]:epoch_start_index[epoch]+batch_size*num_gpus],\
Y:train_y[epoch_start_index[epoch]:epoch_start_index[epoch]+batch_size*num_gpus]})
if i % 10 == 0:
print(i,loss_)
print("Optimization Finished in %s seconds!" %(time.time()-ts))
该实现给出了以下错误:
Traceback (most recent call last):
File "multi_gpu_lstm.py", line 228, in <module>
tf.app.run()
File "/BIGDATA/app/TensorFlow/python-venv/py2.9-gpu/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 44, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "multi_gpu_lstm.py", line 225, in main
training()
File "multi_gpu_lstm.py", line 165, in training
tower_grads_avg = average_gradients(tower_grads)
File "multi_gpu_lstm.py", line 105, in average_gradients
expanded_g = tf.expand_dims(g, 0)
File "/BIGDATA/app/TensorFlow/python-venv/py2.9-gpu/lib/python2.7/site-packages/tensorflow/python/ops/array_ops.py", line 168, in expand_dims
return gen_array_ops._expand_dims(input, axis, name)
File "/BIGDATA/app/TensorFlow/python-venv/py2.9-gpu/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1051, in _expand_dims
result = _op_def_lib.apply_op("ExpandDims", input=input, dim=dim, name=name)
File "/BIGDATA/app/TensorFlow/python-venv/py2.9-gpu/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 504, in apply_op
values, as_ref=input_arg.is_ref).dtype.name
File "/BIGDATA/app/TensorFlow/python-venv/py2.9-gpu/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 702, in internal_convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/BIGDATA/app/TensorFlow/python-venv/py2.9-gpu/lib/python2.7/site-packages/tensorflow/python/framework/constant_op.py", line 110, in _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
File "/BIGDATA/app/TensorFlow/python-venv/py2.9-gpu/lib/python2.7/site-packages/tensorflow/python/framework/constant_op.py", line 99, in constant
tensor_util.make_tensor_proto(value, dtype=dtype, shape=shape, verify_shape=verify_shape))
File "/BIGDATA/app/TensorFlow/python-venv/py2.9-gpu/lib/python2.7/site-packages/tensorflow/python/framework/tensor_util.py", line 360, in make_tensor_proto
raise ValueError("None values not supported.")
ValueError:不支持任何值。
问题出现在这一行:
expanded_g = tf.expand_dims(g, 0)
这是因为g获得None值。如果我修改为:
if g is not None:
expanded_g = tf.expand_dims(g, 0)
一切都很好。
经过调查,我发现毕业生在这一行获得无价值:
grads = optimizer.compute_gradients(loss_op)
当我打印毕业生时,这就是我得到的:
[(<tf.Tensor 'Tower_0/gradients/Tower_0/lstm/MatMul_grad/tuple/control_dependency_1:0' shape=(28, 1) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac80ff32110>), (<tf.Tensor 'Tower_0/gradients/Tower_0/lstm/MatMul_1_grad/tuple/control_dependency_1:0' shape=(1, 1) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac810ad1f90>), (<tf.Tensor 'Tower_0/gradients/Tower_0/lstm/add_grad/tuple/control_dependency_1:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac810ad19d0>), (<tf.Tensor 'Tower_0/gradients/Tower_0/lstm/add_1_grad/tuple/control_dependency_1:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac810ae9d50>), (<tf.Tensor 'Tower_0/gradients/Tower_0/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/lstm_cell/MatMul/Enter_grad/b_acc_3:0' shape=(2, 4) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac814428c50>), (<tf.Tensor 'Tower_0/gradients/Tower_0/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/BiasAdd/Enter_grad/b_acc_3:0' shape=(4,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac814438bd0>), (<tf.Tensor 'Tower_0/gradients/Tower_0/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/mul/Enter_grad/b_acc_3:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac814446650>), (<tf.Tensor 'Tower_0/gradients/Tower_0/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/mul_2/Enter_grad/b_acc_3:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac814446690>), (<tf.Tensor 'Tower_0/gradients/Tower_0/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/mul_4/Enter_grad/b_acc_3:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac8144466d0>)]
[(None, <tensorflow.python.ops.variables.Variable object at 0x2ac80ff32110>), (None, <tensorflow.python.ops.variables.Variable object at 0x2ac810ad1f90>), (None, <tensorflow.python.ops.variables.Variable object at 0x2ac810ad19d0>), (None, <tensorflow.python.ops.variables.Variable object at 0x2ac810ae9d50>), (<tf.Tensor 'Tower_1/gradients/Tower_1/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/lstm_cell/MatMul/Enter_grad/b_acc_3:0' shape=(2, 4) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac814428c50>), (<tf.Tensor 'Tower_1/gradients/Tower_1/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/BiasAdd/Enter_grad/b_acc_3:0' shape=(4,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac814438bd0>), (<tf.Tensor 'Tower_1/gradients/Tower_1/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/mul/Enter_grad/b_acc_3:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac814446650>), (<tf.Tensor 'Tower_1/gradients/Tower_1/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/mul_2/Enter_grad/b_acc_3:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac814446690>), (<tf.Tensor 'Tower_1/gradients/Tower_1/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/mul_4/Enter_grad/b_acc_3:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac8144466d0>), (<tf.Tensor 'Tower_1/gradients/Tower_1/lstm/MatMul_grad/tuple/control_dependency_1:0' shape=(28, 1) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac80ff32150>), (<tf.Tensor 'Tower_1/gradients/Tower_1/lstm/MatMul_1_grad/tuple/control_dependency_1:0' shape=(1, 1) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac8152d9b90>), (<tf.Tensor 'Tower_1/gradients/Tower_1/lstm/add_grad/tuple/control_dependency_1:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac8152d9c50>), (<tf.Tensor 'Tower_1/gradients/Tower_1/lstm/add_1_grad/tuple/control_dependency_1:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac8152fd090>)]
我们可以清楚地看到毕业生的一些无价值
我不知道为什么但是它似乎很奇怪。我认为它可能与某些错误设置的变量有关。 请帮助我找到