Referring to the CIFAR-10 demo for multiple GPUs, I tried to write multi-GPU (multi-tower) code for an MNIST CNN classifier. However, it gives me very low accuracy and no speed improvement either. (Running on multiple GPUs is a requirement in my case.)
Any help in finding the root cause of the low accuracy would be appreciated. The code and a TensorBoard snapshot are below. Thanks in advance.
from __future__ import print_function
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import datetime
import time
import numpy as np

# Make sure TensorFlow sees the devices.
from tensorflow.python.client import device_lib
print("*******Device Info Seen By TensorFlow*******")
print(device_lib.list_local_devices())
print("********************************************")


def check_available_gpus():
    local_devices = device_lib.list_local_devices()
    gpu_names = [x.name for x in local_devices if x.device_type == 'GPU']
    gpu_num = len(gpu_names)
    print('{0} GPUs are detected : {1}'.format(gpu_num, gpu_names))
    return gpu_num


def average_gradients(tower_grads):
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ..., (grad0_gpuN, var0_gpuN))
        grads = []
        for g, _ in grad_and_vars:
            # Add a 0 dimension to the gradient to represent the tower.
            expanded_g = tf.expand_dims(g, 0)
            # Append on a 'tower' dimension which we will average over below.
            grads.append(expanded_g)
        # Average over the 'tower' dimension.
        grad = tf.concat(axis=0, values=grads)
        grad = tf.reduce_mean(grad, 0)
        # The variables are redundant because they are shared across towers,
        # so just return the first tower's pointer to the variable.
        v = grad_and_vars[0][1]
        average_grads.append((grad, v))
    return average_grads


def conv2d(xx, W):
    return tf.nn.conv2d(xx, W, strides=[1, 1, 1, 1], padding='SAME')


def max_pool_2x2(xx):
    return tf.nn.max_pool(xx, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')


def tower_loss(cross_entropy_mean, scope):
    tf.add_to_collection('losses', cross_entropy_mean)
    losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(losses, name='total_loss')
    return total_loss


def train():
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        gpu_num = check_available_gpus()
        batch_size = 150
        mnist = input_data.read_data_sets(".", one_hot=True)
        times = []
        run_metadata = tf.RunMetadata()

        # Split the input batch evenly across the towers.
        x = tf.placeholder(tf.float32, [None, 784], name='x')
        x_img = tf.reshape(x, [-1, 28, 28, 1])
        x_dict = dict(zip(['x' + str(i) for i in range(gpu_num)], tf.split(x_img, gpu_num)))
        y = tf.placeholder(tf.float32, [None, 10], name='y')
        y_dict = dict(zip(['y' + str(i) for i in range(gpu_num)], tf.split(y, gpu_num)))

        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        opt = tf.train.AdamOptimizer(1e-4)
        keep_prob = tf.placeholder(tf.float32)
        grads = []

        # Model parameters, shared by all towers.
        w0 = tf.get_variable('w0', initializer=tf.truncated_normal([5, 5, 1, 32], stddev=0.1), trainable=True)
        b0 = tf.get_variable('b0', initializer=tf.zeros([32]), trainable=True)
        w1 = tf.get_variable('w1', initializer=tf.truncated_normal([5, 5, 32, 64], stddev=0.1), trainable=True)
        b1 = tf.get_variable('b1', initializer=tf.zeros([64]), trainable=True)
        w2 = tf.get_variable('w2', initializer=tf.truncated_normal([7 * 7 * 64, 1024], stddev=0.1), trainable=True)
        b2 = tf.get_variable('b2', initializer=tf.zeros([1024]), trainable=True)
        w3 = tf.get_variable('w3', initializer=tf.truncated_normal([1024, 10], stddev=0.1), trainable=True)
        b3 = tf.get_variable('b3', initializer=tf.zeros([10]), trainable=True)

        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(0, gpu_num):
                with tf.device('/gpu:{0}'.format(i)):
                    with tf.name_scope('scope_gpu_{0}'.format(i)) as infer_scope:
                        h_conv1 = tf.nn.relu(conv2d(x_dict['x{0}'.format(i)], w0) + b0)
                        h_pool1 = max_pool_2x2(h_conv1)
                        h_conv2 = tf.nn.relu(conv2d(h_pool1, w1) + b1)
                        h_pool2 = max_pool_2x2(h_conv2)
                        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
                        h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w2) + b2)
                        h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
                        yy = tf.matmul(h_fc1_drop, w3) + b3
                        tf.get_variable_scope().reuse_variables()
                        loss = tower_loss(
                            tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                                logits=yy, labels=y_dict['y{0}'.format(i)])),
                            infer_scope)
                        grads.append(opt.compute_gradients(loss, tf.trainable_variables()))

        grad = average_gradients(grads)
        apply_grad_op = opt.apply_gradients(grad, global_step=global_step)
        variable_averages = tf.train.ExponentialMovingAverage(0.9999, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())
        # Group all updates into a single train op.
        train_op = tf.group(apply_grad_op, variables_averages_op)

        correct_prediction = tf.equal(tf.argmax(yy, 1), tf.argmax(y_dict['y0'], 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy')

        with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
            sess.run(tf.global_variables_initializer(),
                     options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
                     run_metadata=run_metadata)
            t1_1 = datetime.datetime.now()
            for step in range(0, 5000):
                batch_x, batch_y = mnist.train.next_batch(batch_size)
                start_time = time.time()
                _, loss_val = sess.run([train_op, loss], {x: batch_x, y: batch_y, keep_prob: 0.5})
                times.append(time.time() - start_time)
                assert not np.isnan(loss_val), 'Model diverged with loss = NaN'
                if (step % 100) == 0:
                    train_accuracy = accuracy.eval(feed_dict={x: batch_x, y: batch_y, keep_prob: 1.0})
                    print("step %d, training accuracy %g" % (step, train_accuracy))
            t2_1 = datetime.datetime.now()

            # Debug output: final weights and biases.
            print(sess.run(w0))
            print(sess.run(w1))
            print(sess.run(w2))
            print(sess.run(w3))
            print(sess.run(b0))
            print(sess.run(b1))
            print(sess.run(b2))
            print(sess.run(b3))

            print("test accuracy %g" % accuracy.eval(
                feed_dict={x: mnist.test.images, y: mnist.test.labels, keep_prob: 1.0}))
            print("Total Computation time: " + str(t2_1 - t1_1))

            # Dump a Chrome trace of the profiled step.
            from tensorflow.python.client import timeline
            trace = timeline.Timeline(step_stats=run_metadata.step_stats)
            with open('timeline.ctf.json', 'w') as trace_file:
                trace_file.write(trace.generate_chrome_trace_format())

            # Write the graph for TensorBoard.
            writer = tf.summary.FileWriter('./my_graph', sess.graph)
            writer.close()

            for device in run_metadata.step_stats.dev_stats:
                print(device.device)
                for node in device.node_stats:
                    print("  ", node.node_name)

        times = np.array(times)
        speeds = batch_size / times
        speed_mean = batch_size / np.mean(times)
        speed_uncertainty = np.std(speeds) / np.sqrt(float(len(speeds)))
        speed_jitter = 1.4826 * np.median(np.abs(speeds - np.median(speeds)))
        print("Mean Time Per Step : " + str(np.mean(times)))
        print("Mean Speed : " + str(speed_mean) + " Images/Sec")
        print("speed uncertainty : " + str(speed_uncertainty))
        print("Speed Jitter : " + str(speed_jitter))


def main():
    train()


if __name__ == "__main__":
    main()
Answer 0 (score: 1)
This is fixed now. The problem was with the aggregation of accuracy, correct predictions, and gradients: because of these issues the update op was not updating all the variables correctly, and the accuracy was not being computed correctly. After fixing them I get very good accuracy (> 99% on the test data) and it works with multiple GPUs. Many thanks to everyone for the help.
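For readers hitting the same symptom, a minimal sanity-check sketch (the assertion is an illustrative addition, not part of the original code) that verifies every trainable variable actually receives an averaged gradient before apply_gradients runs:

grad = average_gradients(grads)

# Illustrative check: every trainable variable should appear in the
# averaged gradient list, or the update op silently skips it.
grad_vars = [v for _, v in grad]
assert set(grad_vars) == set(tf.trainable_variables()), 'some variables get no gradient'

apply_grad_op = opt.apply_gradients(grad, global_step=global_step)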
Answer 1 (score: 0)
I wonder whether the problem is related to numerical error. The precision of float32 is quite limited, and aggregating results from many GPUs can introduce errors that keep the model from converging (especially since you perform many array operations, such as averaging, that accumulate error).
To test this hypothesis, you could run the code with 1, 2, 4, and 8 GPUs and see whether the accuracy degrades as the number of GPUs grows. If it does, numerical error is likely to blame. Increasing the precision and checking whether the final result changes would also be a good test.
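One way to run that comparison, as a hedged sketch: add a hypothetical --num_towers flag (not part of the original script) that caps gpu_num, then launch the script once per tower count. Note that tf.split requires the batch size to be divisible by the tower count, so batch_size = 150 would need adjusting (e.g. to 160) for the 4- and 8-GPU runs.

import argparse

# Hypothetical flag for the 1/2/4/8-GPU experiment; not in the original code.
parser = argparse.ArgumentParser()
parser.add_argument('--num_towers', type=int, default=0,
                    help='0 = use all detected GPUs, otherwise cap the tower count')
args = parser.parse_args()

gpu_num = check_available_gpus()
if args.num_towers > 0:
    gpu_num = min(gpu_num, args.num_towers)

Invoked with --num_towers 1, 2, 4, and 8 in successive runs, comparing the final test accuracies.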
Answer 2 (score: 0)
Looking at your code, I see that you define w0, b0 ... w3, b3 with tf.get_variable, but outside the GPU loop. Try defining the variables inside the with tf.variable_scope(('scope_gpu_{0}').format(i)) scope instead. Since you call tf.get_variable_scope().reuse_variables(), all the variables will still be shared between the towers (even though you would then appear to define them once per tower inside the loop).
Edit:
Look at this line:
correct_prediction = tf.equal(tf.argmax(yy, 1), tf.argmax(y_dict['y0'], 1))
Here yy is the last logits tensor from the GPU loop, so it contains the predictions from the last GPU, while y_dict['y0'] holds the labels for the first GPU.
To fix this, you should collect the correct_prediction tensors in a list, just as you do for the gradients, and average them.
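A minimal sketch of that fix (the list name tower_correct is illustrative): compute one correct_prediction tensor per tower, against that tower's own labels, inside the loop, then combine them so the accuracy covers the whole batch.

tower_correct = []  # one boolean tensor per GPU

for i in range(gpu_num):
    with tf.device('/gpu:{0}'.format(i)):
        # ... build the tower and obtain its logits yy, as in the question ...
        tower_correct.append(
            tf.equal(tf.argmax(yy, 1),
                     tf.argmax(y_dict['y{0}'.format(i)], 1)))

# Concatenate the per-tower results, then average over the full batch.
correct_prediction = tf.concat(tower_correct, axis=0)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy')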