Tensorflow Multi-GPU LSTM:ValueError:不支持任何值

时间:2017-09-30 02:42:10

标签: tensorflow lstm multi-gpu gradients

我正在尝试使用Tensorflow实现多GPU LSTM,该Tensorflow基于存在的cifar10代码here

这是完整的代码: 我首先定义了LSTM CELL:

def lstm_cell(X, reuse):
# Define a scope for reusing the variables
with tf.variable_scope('lstm', reuse=reuse): 
    weights={
         'in':tf.Variable(tf.random_normal([num_input_features,hidden_units])),
         'out':tf.Variable(tf.random_normal([hidden_units,1]))
         }
    biases={
        'in':tf.Variable(tf.constant(0.1,shape=[hidden_units,])),
        'out':tf.Variable(tf.constant(0.1,shape=[1]))
       }
     ....
     # the rest of the code to create a lstm cell and pred variable

return pred

计算一个塔中损失的函数:

  def total_loss(pred_train,Y,reuse):
        loss_op=tf.reduce_mean(tf.square(tf.reshape(pred_train,[-1])-tf.reshape(Y, [-1])))
        tf.add_to_collection('losses', loss_op)
        _total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
        return _total_loss

在所有塔中造成全球损失的功能

def tower_loss(scope,X,Y,reuse):
  pred_train =lstm_cell(X,reuse)
  _= total_loss(pred_train,Y,reuse)
  _tower_loss_list = tf.get_collection('losses', scope)
  _tower_loss = tf.add_n(_tower_loss_list, name='tower_loss')

  return _tower_loss

平均渐变

的函数
def average_gradients(tower_grads):
average_grads = []
for grad_and_vars in zip(*tower_grads):
    # Note that each grad_and_vars looks like the following:
    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
    grads = []
    for g, _ in grad_and_vars:
        # Add 0 dimension to the gradients to represent the tower.
        expanded_g = tf.expand_dims(g, 0)

        # Append on a 'tower' dimension which we will average over below.
        grads.append(expanded_g)

    # Average over the 'tower' dimension.
    grad = tf.concat(grads, 0)
    grad = tf.reduce_mean(grad, 0)

    # Keep in mind that the Variables are redundant because they are shared
    # across towers. So .. we will just return the first tower's pointer to
    # the Variable.
    v = grad_and_vars[0][1]
    grad_and_var = (grad, v)
    average_grads.append(grad_and_var)
return average_grads

最后是培训:

 # Place all ops on CPU by default
 def training():
  with tf.Graph().as_default(),tf.device('/cpu:0'):
      tower_grads = []
      reuse_vars = False

      # tf Graph input
      X=tf.placeholder(tf.float32, shape=[None,time_step,num_input_features],name='X') 
      Y=tf.placeholder(tf.float32, shape=[None,time_step,output_size],name='Y')

     tf.train.GradientDescentOptimizer(learning_rate=decay_learning_rate)
      # Loop over all GPUs and construct their own computation graph
      for i in range(num_gpus):
          with tf.device('/gpu:%d' % i):
             with tf.name_scope('Tower_%d'%i) as scope:
              # Split data between GPUs
              _x = X[i * batch_size: (i+1) * batch_size]
              _y = Y[i * batch_size: (i+1) * batch_size]

              loss_op = tower_loss(scope,_x,_y,reuse=reuse_vars)

              # Reuse variables for the next tower.
              tf.get_variable_scope().reuse_variables()

              grads = optimizer.compute_gradients(loss_op)

              reuse_vars = True
              tower_grads.append(grads)

      tower_grads_avg = average_gradients(tower_grads)
      apply_gradient_op = optimizer.apply_gradients(tower_grads_avg,global_step=global_step)
      train_op = apply_gradient_op


      init = tf.global_variables_initializer()

      # Start Training
      with tf.Session() as sess:
          print ('----Training the LSTM With Multi GPU------')
          # Run the initializer
          sess.run(init)
          ts = time.time()
          for i in range(max_iters):
                     for epoch in range(len(epoch_start_index)//num_gpus): 
                         _,loss_=sess.run([train_op,loss_op,],feed_dict = \
                                         {X:train_x[epoch_start_index[epoch]:epoch_start_index[epoch]+batch_size*num_gpus],\
                                          Y:train_y[epoch_start_index[epoch]:epoch_start_index[epoch]+batch_size*num_gpus]})
                     if i % 10 == 0:
                             print(i,loss_)

          print("Optimization Finished in %s seconds!" %(time.time()-ts))

该实现给出了以下错误:

Traceback (most recent call last):
File "multi_gpu_lstm.py", line 228, in <module>
tf.app.run()
File "/BIGDATA/app/TensorFlow/python-venv/py2.9-gpu/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 44, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "multi_gpu_lstm.py", line 225, in main
training()
File "multi_gpu_lstm.py", line 165, in training
tower_grads_avg = average_gradients(tower_grads)
File "multi_gpu_lstm.py", line 105, in average_gradients
expanded_g = tf.expand_dims(g, 0)
File "/BIGDATA/app/TensorFlow/python-venv/py2.9-gpu/lib/python2.7/site-packages/tensorflow/python/ops/array_ops.py", line 168, in expand_dims
return gen_array_ops._expand_dims(input, axis, name)
File "/BIGDATA/app/TensorFlow/python-venv/py2.9-gpu/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1051, in _expand_dims
result = _op_def_lib.apply_op("ExpandDims", input=input, dim=dim, name=name)
File "/BIGDATA/app/TensorFlow/python-venv/py2.9-gpu/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 504, in apply_op
values, as_ref=input_arg.is_ref).dtype.name
File "/BIGDATA/app/TensorFlow/python-venv/py2.9-gpu/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 702, in internal_convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/BIGDATA/app/TensorFlow/python-venv/py2.9-gpu/lib/python2.7/site-packages/tensorflow/python/framework/constant_op.py", line 110, in _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
File "/BIGDATA/app/TensorFlow/python-venv/py2.9-gpu/lib/python2.7/site-packages/tensorflow/python/framework/constant_op.py", line 99, in constant
tensor_util.make_tensor_proto(value, dtype=dtype, shape=shape, verify_shape=verify_shape))
File "/BIGDATA/app/TensorFlow/python-venv/py2.9-gpu/lib/python2.7/site-packages/tensorflow/python/framework/tensor_util.py", line 360, in make_tensor_proto
 raise ValueError("None values not supported.")

ValueError:不支持任何值。

问题出现在这一行:

expanded_g = tf.expand_dims(g, 0)

这是因为g获得None值。如果我修改为:

if g is not None:
    expanded_g = tf.expand_dims(g, 0)
一切都很好。 经过调查,我发现毕业生在这一行获得无价值:

grads = optimizer.compute_gradients(loss_op)

当我打印毕业生时,这就是我得到的:

[(<tf.Tensor 'Tower_0/gradients/Tower_0/lstm/MatMul_grad/tuple/control_dependency_1:0' shape=(28, 1) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac80ff32110>), (<tf.Tensor 'Tower_0/gradients/Tower_0/lstm/MatMul_1_grad/tuple/control_dependency_1:0' shape=(1, 1) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac810ad1f90>), (<tf.Tensor 'Tower_0/gradients/Tower_0/lstm/add_grad/tuple/control_dependency_1:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac810ad19d0>), (<tf.Tensor 'Tower_0/gradients/Tower_0/lstm/add_1_grad/tuple/control_dependency_1:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac810ae9d50>), (<tf.Tensor 'Tower_0/gradients/Tower_0/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/lstm_cell/MatMul/Enter_grad/b_acc_3:0' shape=(2, 4) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac814428c50>), (<tf.Tensor 'Tower_0/gradients/Tower_0/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/BiasAdd/Enter_grad/b_acc_3:0' shape=(4,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac814438bd0>), (<tf.Tensor 'Tower_0/gradients/Tower_0/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/mul/Enter_grad/b_acc_3:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac814446650>), (<tf.Tensor 'Tower_0/gradients/Tower_0/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/mul_2/Enter_grad/b_acc_3:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac814446690>), (<tf.Tensor 'Tower_0/gradients/Tower_0/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/mul_4/Enter_grad/b_acc_3:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac8144466d0>)]
  [(None, <tensorflow.python.ops.variables.Variable object at 0x2ac80ff32110>), (None, <tensorflow.python.ops.variables.Variable object at 0x2ac810ad1f90>), (None, <tensorflow.python.ops.variables.Variable object at 0x2ac810ad19d0>), (None, <tensorflow.python.ops.variables.Variable object at 0x2ac810ae9d50>), (<tf.Tensor 'Tower_1/gradients/Tower_1/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/lstm_cell/MatMul/Enter_grad/b_acc_3:0' shape=(2, 4) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac814428c50>), (<tf.Tensor 'Tower_1/gradients/Tower_1/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/BiasAdd/Enter_grad/b_acc_3:0' shape=(4,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac814438bd0>), (<tf.Tensor 'Tower_1/gradients/Tower_1/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/mul/Enter_grad/b_acc_3:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac814446650>), (<tf.Tensor 'Tower_1/gradients/Tower_1/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/mul_2/Enter_grad/b_acc_3:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac814446690>), (<tf.Tensor 'Tower_1/gradients/Tower_1/lstm/rnn/while/multi_rnn_cell/cell_0/lstm_cell/mul_4/Enter_grad/b_acc_3:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac8144466d0>), (<tf.Tensor 'Tower_1/gradients/Tower_1/lstm/MatMul_grad/tuple/control_dependency_1:0' shape=(28, 1) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac80ff32150>), (<tf.Tensor 'Tower_1/gradients/Tower_1/lstm/MatMul_1_grad/tuple/control_dependency_1:0' shape=(1, 1) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac8152d9b90>), (<tf.Tensor 'Tower_1/gradients/Tower_1/lstm/add_grad/tuple/control_dependency_1:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac8152d9c50>), (<tf.Tensor 'Tower_1/gradients/Tower_1/lstm/add_1_grad/tuple/control_dependency_1:0' shape=(1,) dtype=float32>, <tensorflow.python.ops.variables.Variable object at 0x2ac8152fd090>)]

我们可以清楚地看到毕业生的一些无价值

我不知道为什么但是它似乎很奇怪。我认为它可能与某些错误设置的变量有关。 请帮助我找到

0 个答案:

没有答案