I tried to apply the multi-GPU tower defs from cifar10_train_multiGPU.py to MNIST data in a Jupyter notebook environment.
- However, I do not apply MOVING_AVERAGE_DECAY to the losses and variables.
When I implement these defs, an error occurs and I don't understand why. Please help me understand it, and if possible, give me a better template for making full use of the GPUs.
The following is where the error comes from.
with tf.device('/cpu:0'):
    # Setting optimizer
    optimizer = OPTIMIZER('GD')

    # Calculate the gradients for each model tower.
    total_grads = []
    for i in range(n_GPU):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                tower_loss = TOWER_LOSS(scope)
                tf.get_variable_scope().reuse_variables()
                tower_grads = optimizer.compute_gradients(tower_loss)
                print(tower_grads)
                total_grads.append(tower_grads)
    grads = AVERAGE_GRADIENTS(total_grads)
    train_op = optimizer.apply_gradients(grads, global_step=global_step)
When I print(tower_grads), the result for the first device looks fine. After that, however, None appears:
[(<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>)]
[(None, <var>), (None, <var>), (None, <var>), (None, <var>), (None, <var>), (None, <var>), (None, <var>), (None, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>)]
So I changed the code to compute gradients only for the variables created within each tower's own scope:
tower_grads = optimizer.compute_gradients(tower_loss, tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope))
Then the code runs. However, the loss is not minimized:
Iter:100   Total_loss:2.62306
Iter:200   Total_loss:2.67096
Iter:300   Total_loss:2.82596
Iter:400   Total_loss:2.76608
Iter:500   Total_loss:2.71014
Iter:600   Total_loss:2.79502
Iter:700   Total_loss:2.8309
Iter:800   Total_loss:2.89564
Iter:900   Total_loss:2.75665
Iter:1000  Total_loss:2.75197
Iter:1100  Total_loss:2.90878
Iter:1200  Total_loss:2.6832
Iter:1300  Total_loss:2.80052
The full code is below.
x = tf.placeholder(tf.float32, shape=[None, IMAGE_SIZE*IMAGE_SIZE*IMAGE_CHANNELS], name='x_raw')
y_ = tf.placeholder(tf.float32, shape=[None, LABEL_SIZE], name='label')

# Global_step
global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)

def INFERENCE(_X):
    _x_reshaped = tf.reshape(_X, [-1, IMAGE_SIZE, IMAGE_SIZE, IMAGE_CHANNELS], name='x_reshaped')
    C1 = _CONV(_x_reshaped, 5, 64)
    S1 = _MAXPOOL(C1)
    C2 = _CONV(S1, 5, 128)
    S2 = _MAXPOOL(C2)
    _size = S2.get_shape()[1].value
    _channel_size = S2.get_shape()[3].value
    x_flatten = tf.reshape(S2, [-1, _size*_size*_channel_size])
    FC3 = tf.nn.relu(_LINEAR(x_flatten, n_out=1024))
    _softmax_linear = _LINEAR(FC3, LABEL_SIZE)
    return _softmax_linear
def _WEIGHT_WITH_WD(_SHAPE):
    _initial = tf.truncated_normal(_SHAPE, stddev=0.05)
    _weight = tf.Variable(_initial, name='weight', trainable=True)
    return _weight

def _BIAS(_SHAPE):
    _initial = tf.constant(0.1, shape=_SHAPE)  # avoid dead cells
    return tf.Variable(_initial, name='bias', trainable=True)
def _CONV(_X, kernel_size, out_channel_size, stride_size=1, wd=WD_CONV):
    _in_channels = _X.get_shape()[3].value
    _kernel_shape = [kernel_size, kernel_size, _in_channels, out_channel_size]
    _kernel = _WEIGHT_WITH_WD(_kernel_shape)
    _stride = [1, stride_size, stride_size, 1]
    _conv = tf.nn.conv2d(_X, _kernel, _stride, padding='SAME')
    _conv_activated = tf.nn.relu(_conv + _BIAS([out_channel_size]))
    return _conv_activated

def _MAXPOOL(_X, kernel_size=2, stride_size=2):
    _kernel_shape = [1, kernel_size, kernel_size, 1]
    _stride = [1, stride_size, stride_size, 1]
    _max_pool = tf.nn.max_pool(_X, _kernel_shape, _stride, padding='SAME')
    return _max_pool

def _LINEAR(_X, n_out):
    _weight = _WEIGHT_WITH_WD([_X.get_shape()[1].value, n_out])
    _bias = _BIAS([n_out])
    _linear = tf.matmul(_X, _weight) + _bias
    return _linear

def OPTIMIZER(optimizer='GD'):
    _lr = LR
    _optimizer_dic = {'GD': tf.train.GradientDescentOptimizer(_lr), 'ADAM': tf.train.AdamOptimizer(_lr)}
    _opt = _optimizer_dic[optimizer]
    return _opt

def TOTAL_LOSS(logits, labels):
    with tf.name_scope('Loss') as scope:
        _cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, labels, name='cross_entropy_per_example')
        _cross_entropy_mean = tf.reduce_mean(_cross_entropy, name='cross_entropy_per_batch')
        tf.add_to_collection('losses', _cross_entropy_mean)
        _total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
    return _total_loss

def TOWER_LOSS(scope):
    _logits = INFERENCE(x)
    _ = TOTAL_LOSS(_logits, y_)
    _tower_loss_list = tf.get_collection('losses', scope)
    _tower_loss = tf.add_n(_tower_loss_list, name='tower_loss')

    # Compute the moving average of all individual losses and the total loss.
    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
    loss_averages_op = loss_averages.apply(_tower_loss_list + [_tower_loss])
    with tf.control_dependencies([loss_averages_op]):
        _tower_loss = tf.identity(_tower_loss)
    return _tower_loss

def AVERAGE_GRADIENTS(total_grads):
    average_grads = []
    for grad_n_vars in zip(*total_grads):
        grads = []
        for g, _ in grad_n_vars:
            expanded_g = tf.expand_dims(g, 0)
            grads.append(expanded_g)
        grad = tf.concat(0, grads)
        grad = tf.reduce_sum(grad, reduction_indices=0)
        var = grad_n_vars[0][1]
        grad_n_var = (grad, var)
        average_grads.append(grad_n_var)
    return average_grads
with tf.device('/cpu:0'):
    # Setting optimizer
    optimizer = OPTIMIZER('GD')

    # Calculate the gradients for each model tower.
    total_grads = []
    for i in range(n_GPU):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                tower_loss = TOWER_LOSS(scope)
                tf.get_variable_scope().reuse_variables()
                tower_grads = optimizer.compute_gradients(tower_loss, tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope))
                total_grads.append(tower_grads)
    grads = AVERAGE_GRADIENTS(total_grads)
    train_op = optimizer.apply_gradients(grads, global_step=global_step)

## Launch the graph
with tf.Session(config=CONFIG) as sess:
    sess.run(tf.initialize_all_variables())

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    # training
    for i in range(MAX_ITER):
        if i % PRINT_BY != 0 or i == 0:
            tr_x, tr_y_ = mnist.train.next_batch(BATCH_SIZE)
            sess.run(train_op, feed_dict={x: tr_x, y_: tr_y_})
        else:
            ## print
            gstep, tower_loss_ = sess.run([global_step, tower_loss], feed_dict={x: tr_x, y_: tr_y_})
            print("GStep: %d \tIter :%d \tTotal_loss: %g" % (gstep, i, tower_loss_))
Answer (score: 2)
From your output, it looks like each tower is creating a new set of variables:
[(<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>)]
[(None, <var>), (None, <var>), (None, <var>), (None, <var>), (None, <var>), (None, <var>), (None, <var>), (None, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>), (<grad>, <var>)]
Note that the output for the second tower contains twice as many variables (16 vs. 8), and that the first 8 variables, the ones created by the first tower, have no gradient in the second tower: the second tower's loss does not depend on them, so compute_gradients() returns None for them.
There are two possible fixes:
1. Make sure that your TOWER_LOSS() function builds its model inside a with tf.variable_scope(...): block. See the cifar10.inference() function for an example; you need to create a tf.variable_scope() in order to use variable sharing.
2. Use tf.get_variable() instead of tf.Variable(), to ensure that variables are shared wherever possible. Here's an example of how this is done in CIFAR-10. (A sketch combining both fixes follows this list.)
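As a rough sketch only, not your original code and not the exact CIFAR-10 implementation, here is one way the helpers and layers from the question could be rewritten so that the towers share their variables. It assumes the same TF 0.x API and constants (IMAGE_SIZE, IMAGE_CHANNELS, LABEL_SIZE, ...) as the question; the layer_name arguments are invented for this example, and the unused weight-decay argument is dropped for brevity.

def _WEIGHT_WITH_WD(_SHAPE):
    # When the enclosing variable scope has reuse=True, tf.get_variable()
    # returns the existing 'weight' variable instead of creating a new one.
    return tf.get_variable('weight', _SHAPE,
                           initializer=tf.truncated_normal_initializer(stddev=0.05))

def _BIAS(_SHAPE):
    return tf.get_variable('bias', _SHAPE,
                           initializer=tf.constant_initializer(0.1))

def _CONV(_X, kernel_size, out_channel_size, stride_size=1, layer_name='conv'):
    # Each layer gets its own variable scope so the 'weight'/'bias' names
    # do not collide between layers.
    with tf.variable_scope(layer_name):
        _in_channels = _X.get_shape()[3].value
        _kernel = _WEIGHT_WITH_WD([kernel_size, kernel_size, _in_channels, out_channel_size])
        _conv = tf.nn.conv2d(_X, _kernel, [1, stride_size, stride_size, 1], padding='SAME')
        return tf.nn.relu(_conv + _BIAS([out_channel_size]))

def _LINEAR(_X, n_out, layer_name='linear'):
    with tf.variable_scope(layer_name):
        _weight = _WEIGHT_WITH_WD([_X.get_shape()[1].value, n_out])
        _bias = _BIAS([n_out])
        return tf.matmul(_X, _weight) + _bias

def INFERENCE(_X):
    # _MAXPOOL needs no change because it creates no variables.
    _x_reshaped = tf.reshape(_X, [-1, IMAGE_SIZE, IMAGE_SIZE, IMAGE_CHANNELS])
    C1 = _CONV(_x_reshaped, 5, 64, layer_name='conv1')
    S1 = _MAXPOOL(C1)
    C2 = _CONV(S1, 5, 128, layer_name='conv2')
    S2 = _MAXPOOL(C2)
    _size = S2.get_shape()[1].value
    _channel_size = S2.get_shape()[3].value
    x_flatten = tf.reshape(S2, [-1, _size * _size * _channel_size])
    FC3 = tf.nn.relu(_LINEAR(x_flatten, n_out=1024, layer_name='fc3'))
    return _LINEAR(FC3, LABEL_SIZE, layer_name='softmax_linear')

With variables created through tf.get_variable() inside per-layer variable scopes, the tf.get_variable_scope().reuse_variables() call that is already in your tower loop should make every tower after the first reuse the first tower's weights. You can then go back to the plain optimizer.compute_gradients(tower_loss) call, without the per-scope var_list, and AVERAGE_GRADIENTS() should receive a gradient for every variable in every tower.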
For more details about sharing variables, see the how-to guide.