我正在尝试并行化我的代码,以使我的tensorflow模型在多个GPU上运行。出于某种原因,我编写的用于并行化训练的代码用于标准深度神经网络,但在使用卷积神经网络时会抛出错误。
这是我计算平均梯度的代码:
def average_gradients(tower_grads):
average_grads = []
for grad_and_vars in zip(*tower_grads):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
grads = []
for g, _ in grad_and_vars:
# Add 0 dimension to the gradients to represent the tower.
expanded_g = tf.expand_dims(g, 0)
# Append on a 'tower' dimension which we will average over below.
grads.append(expanded_g)
# Average over the 'tower' dimension.
grad = tf.concat(axis=0, values=grads)
grad = tf.reduce_mean(grad, 0)
# Keep in mind that the Variables are redundant because they are shared
# across towers. So .. we will just return the first tower's pointer to
# the Variable.
v = grad_and_vars[0][1]
grad_and_var = (grad, v)
average_grads.append(grad_and_var)
return average_grads
这是我的深度神经网络架构:(这有效)
def neuralNet(data):
hl_1 = {'weights':tf.get_variable('Weights1',[TF_SHAPE,n_nodes_hl1],initializer=tf.random_normal_initializer()),
'biases':tf.get_variable('Biases1',[n_nodes_hl1],initializer=tf.random_normal_initializer())}
hl_2 = {'weights':tf.get_variable('Weights2',[n_nodes_hl1, n_nodes_hl2],initializer=tf.random_normal_initializer()),
'biases':tf.get_variable('Biases2',[n_nodes_hl2],initializer=tf.random_normal_initializer())}
hl_3 = {'weights':tf.get_variable('Weights3',[n_nodes_hl2, n_nodes_hl3],initializer=tf.random_normal_initializer()),
'biases':tf.get_variable('Biases3',[n_nodes_hl3],initializer=tf.random_normal_initializer())}
hl_4 = {'weights':tf.get_variable('Weights4',[n_nodes_hl3, n_nodes_hl4],initializer=tf.random_normal_initializer()),
'biases':tf.get_variable('Biases4',[n_nodes_hl4],initializer=tf.random_normal_initializer())}
hl_5 = {'weights':tf.get_variable('Weights5',[n_nodes_hl4, n_nodes_hl5],initializer=tf.random_normal_initializer()),
'biases':tf.get_variable('Biases5',[n_nodes_hl5],initializer=tf.random_normal_initializer())}
output_layer = {'weights':tf.get_variable('Weights-outputlayer',[n_nodes_hl5, n_classes],initializer=tf.random_normal_initializer()),
'biases':tf.get_variable('Biases-outputlayer',[n_classes],initializer=tf.random_normal_initializer())}
l1 = tf.add(tf.matmul(data, hl_1['weights']), hl_1['biases'])
l1 = tf.nn.sigmoid(l1, name='op1')
l2 = tf.add(tf.matmul(l1, hl_2['weights']), hl_2['biases'])
l2 = tf.nn.sigmoid(l2, name='op2')
l3 = tf.add(tf.matmul(l2, hl_3['weights']), hl_3['biases'])
l3 = tf.nn.sigmoid(l3, name='op3')
l4 = tf.add(tf.matmul(l3, hl_4['weights']), hl_4['biases'])
l4 = tf.nn.sigmoid(l4, name='op4')
l5 = tf.add(tf.matmul(l4, hl_5['weights']), hl_5['biases'])
l5 = tf.nn.sigmoid(l5, name='op5')
dropout = tf.nn.dropout(l5,keep_prob, name='op6')
ol = tf.add(tf.matmul(dropout, output_layer['weights']), output_layer['biases'], name='op7')
return ol
这是我的信条:(这不起作用)
def conv2d(x,W):
return tf.nn.conv2d(x, W, strides=[1,1,1,1], padding='SAME')
def maxpool2d(x):
return tf.nn.max_pool(x, ksize=[1,2,2,1], strides=[1,2,2,1], padding="SAME")
def convNeuralNet(x):
weights = {'w_conv1':tf.get_variable('w_conv1',[7,7,1,2],initializer=tf.random_normal_initializer()),
'w_conv2':tf.get_variable('w_conv2',[7,7,2,4],initializer=tf.random_normal_initializer()),
'w_conv3':tf.get_variable('w_conv3',[7,7,4,8],initializer=tf.random_normal_initializer()),
'w_conv4':tf.get_variable('w_conv4',[7,7,8,16],initializer=tf.random_normal_initializer()),
'w_conv5':tf.get_variable('w_conv5',[7,7,16,32],initializer=tf.random_normal_initializer()),
'w_conv6':tf.get_variable('w_conv6',[7,7,32,64],initializer=tf.random_normal_initializer()),
'w_conv7':tf.get_variable('w_conv7',[7,7,64,128],initializer=tf.random_normal_initializer()),
'w_conv8':tf.get_variable('w_conv8',[7,7,128,256],initializer=tf.random_normal_initializer()),
'w_conv9':tf.get_variable('w_conv9',[7,7,256,512],initializer=tf.random_normal_initializer()),
'w_fc1':tf.get_variable('w_fc1',[512,1024],initializer=tf.random_normal_initializer()),
'w_fc2':tf.get_variable('w_fc2',[1024,2048],initializer=tf.random_normal_initializer()),
'w_fc3':tf.get_variable('w_fc3',[2048,4096],initializer=tf.random_normal_initializer()),
'out':tf.get_variable('w_out',[4096,n_classes],initializer=tf.random_normal_initializer())}
biases = {'b_conv1':tf.get_variable('b_conv1',[2],initializer=tf.random_normal_initializer()),
'b_conv2':tf.get_variable('b_conv2',[4],initializer=tf.random_normal_initializer()),
'b_conv3':tf.get_variable('b_conv3',[8],initializer=tf.random_normal_initializer()),
'b_conv4':tf.get_variable('b_conv4',[16],initializer=tf.random_normal_initializer()),
'b_conv5':tf.get_variable('b_conv5',[32],initializer=tf.random_normal_initializer()),
'b_conv6':tf.get_variable('b_conv6',[64],initializer=tf.random_normal_initializer()),
'b_conv7':tf.get_variable('b_conv7',[128],initializer=tf.random_normal_initializer()),
'b_conv8':tf.get_variable('b_conv8',[256],initializer=tf.random_normal_initializer()),
'b_conv9':tf.get_variable('b_conv9',[512],initializer=tf.random_normal_initializer()),
'b_fc1':tf.get_variable('b_fc1',[1024],initializer=tf.random_normal_initializer()),
'b_fc2':tf.get_variable('b_fc2',[2048],initializer=tf.random_normal_initializer()),
'b_fc3':tf.get_variable('b_fc3',[4096],initializer=tf.random_normal_initializer()),
'out':tf.get_variable('b_out',[n_classes],initializer=tf.random_normal_initializer())}
x = tf.reshape(x,shape=[-1,7,len_puzzle,1])
conv1 = conv2d(x, weights['w_conv1'])
conv1 = maxpool2d(conv1)
conv2 = conv2d(conv1, weights['w_conv2'])
conv2 = maxpool2d(conv2)
conv3 = conv2d(conv2, weights['w_conv3'])
conv3 = maxpool2d(conv3)
conv4 = conv2d(conv3, weights['w_conv4'])
conv4 = maxpool2d(conv4)
conv5 = conv2d(conv4, weights['w_conv5'])
conv5 = maxpool2d(conv5)
conv6 = conv2d(conv5, weights['w_conv6'])
conv6 = maxpool2d(conv6)
conv7 = conv2d(conv6, weights['w_conv7'])
conv7 = maxpool2d(conv7)
conv8 = conv2d(conv7, weights['w_conv8'])
conv8 = maxpool2d(conv8)
conv9 = conv2d(conv8, weights['w_conv9'])
conv9 = maxpool2d(conv9)
fc1 = tf.reshape(conv9, [-1,512])
fc1 = tf.nn.sigmoid(tf.add(tf.matmul(fc1,weights['w_fc1']),biases['b_fc1']))
fc2 = tf.nn.sigmoid(tf.add(tf.matmul(fc1,weights['w_fc2']),biases['b_fc2']))
fc3 = tf.nn.sigmoid(tf.add(tf.matmul(fc2,weights['w_fc3']),biases['b_fc3']))
last = tf.nn.dropout(fc3,keep_prob)
output = tf.add(tf.matmul(last, weights['out']), biases['out'], name='op7')
return output
这是运行会话的代码:
def train(x):
tower_grads = []
opt = tf.train.AdamOptimizer(learning_rate)
for i in xrange(2):
with tf.device('/gpu:%d' % i):
with tf.variable_scope('NN',reuse=i>0):
prediction = convNeuralNet(x)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction,labels=y))
tf.summary.scalar('cross_entropy',cost)
grads = opt.compute_gradients(cost)
tower_grads.append(grads)
print grads
print len(grads)
#scope.reuse_variables()
grads = average_gradients(tower_grads)
apply_gradient_op = opt.apply_gradients(grads)
train_op = tf.group(apply_gradient_op)
correct = tf.equal(tf.argmax(prediction,1),tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct,'float'))
tf.summary.scalar('accuracy',accuracy)
num_epochs = ne
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)) as sess:
saver = tf.train.Saver()
# UNCOMMENT THIS WHEN RESTARTING FROM Checkpoint
#saver.restore(sess, tf.train.latest_checkpoint(os.getcwd()+'/models/base/.'))
sess.run(tf.global_variables_initializer())
merged_summary = tf.summary.merge_all()
for epoch in range(num_epochs):
epoch_loss = 0
for i in range(int(real_X_9.shape[0])/batch_size):#mnist.train.num_examples/batch_size)): # X.shape[0]
randidx = np.random.choice(real_X_9.shape[0], batch_size, replace=False)
epoch_x,epoch_y = real_X_9[randidx,:],real_y_9[randidx,:] #mnist.train.next_batch(batch_size) # X,y
j,c = sess.run([train_op,cost],feed_dict={x:epoch_x,y:epoch_y,keep_prob:TRAIN_KEEP_PROB})
if i == 0:
[ta] = sess.run([accuracy],feed_dict={x:epoch_x,y:epoch_y,keep_prob:TRAIN_KEEP_PROB})
print 'Train Accuracy', ta
epoch_loss += c
print '\n','Epoch', epoch + 1, 'completed out of', num_epochs, '\nLoss:',epoch_loss
#saver.save(sess, os.getcwd()+'/models/base/baseDNN7')
#saver.export_meta_graph(os.getcwd()+'/models/base/baseDNN7.meta')
print '\n','Train Accuracy', accuracy.eval(feed_dict={x:real_X_9, y:real_y_9, keep_prob:TRAIN_KEEP_PROB})
print '\n','Test Accuracy', accuracy.eval(feed_dict={x:test_real_X, y:test_real_y, keep_prob:1.0}) #X, y #mnist.test.images, mnist.test.labels
train(x)
这是错误:
Traceback (most recent call last):
File "CNN_gpu.py", line 393, in <module>
train(x)
File "CNN_gpu.py", line 311, in train
grads = average_gradients(tower_grads)
expanded_g = tf.expand_dims(g, 0)
File "/share/sw/free/tensorflow.1/1.1.0/lib/python2.7/site-packages/tensorflow/python/ops/array_ops.py", line 170, in expand_dims
return gen_array_ops._expand_dims(input, axis, name)
File "/share/sw/free/tensorflow.1/1.1.0/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 900, in _expand_dims
result = _op_def_lib.apply_op("ExpandDims", input=input, dim=dim, name=name)
File "/share/sw/free/tensorflow.1/1.1.0/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 509, in apply_op
(input_name, err))
ValueError: Tried to convert 'input' to a tensor and failed. Error: None values not supported.
我真的很困惑。无论使用何种类型的神经网络,跨多个GPU的并行化都应该有效。
任何帮助都将不胜感激。