I am trying to train an alexnet_v2 model from scratch on the ImageNet dataset in distributed mode (Horovod + TensorFlow). All fully connected layers in alexnet_v2 have been converted to conv2d layers. I have run into some tricky problems. First, the loss of the original alexnet_v2 model was very hard to converge, so I added a BN layer after each conv2d layer. After 100 epochs the training accuracy is almost 0.6, but the validation accuracy is only 0.35, and the validation loss is much larger than the training loss. Here is my alexnet_v2:
trunc_normal = lambda stddev: tf.truncated_normal_initializer(0.0, stddev)

def alexnet_v2_arg_scope(weight_decay=0.02, is_training=True):
    batch_norm_params = {
        'is_training': is_training,
        'decay': 0.9,
        'epsilon': 0.0001,
        'updates_collections': tf.GraphKeys.UPDATE_OPS
    }
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        biases_initializer=tf.constant_initializer(0.1),
                        weights_regularizer=slim.l2_regularizer(weight_decay),
                        normalizer_fn=slim.batch_norm,
                        normalizer_params=batch_norm_params):
        with slim.arg_scope([slim.conv2d], padding='SAME'):
            with slim.arg_scope([slim.max_pool2d], padding='VALID') as arg_sc:
                return arg_sc
def alexnet_v2(inputs,
               num_classes=1000,
               is_training=True,
               dropout_keep_prob=0.5,
               spatial_squeeze=False,
               scope='alexnet_v2',
               global_pool=False):
    with tf.variable_scope(scope, 'alexnet_v2', [inputs], reuse=tf.AUTO_REUSE) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d],
                            outputs_collections=[end_points_collection]):
            net = slim.conv2d(inputs, 64, [11, 11], 4, padding='VALID', scope='conv1')
            net = slim.max_pool2d(net, [3, 3], 2, scope='pool1')
            net = slim.conv2d(net, 192, [5, 5], scope='conv2')
            net = slim.max_pool2d(net, [3, 3], 2, scope='pool2')
            net = slim.conv2d(net, 384, [3, 3], scope='conv3')
            net = slim.conv2d(net, 384, [3, 3], scope='conv4')
            net = slim.conv2d(net, 256, [3, 3], scope='conv5')
            net = slim.max_pool2d(net, [3, 3], 2, scope='pool5')

            # Use conv2d instead of fully_connected layers.
            with slim.arg_scope([slim.conv2d],
                                weights_initializer=trunc_normal(0.005),
                                biases_initializer=tf.constant_initializer(0.1),
                                normalizer_fn=None):
                net = slim.conv2d(net, 4096, [5, 5], padding='VALID', scope='fc6')
                net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                                   scope='dropout6')
                net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
                # Convert end_points_collection into an end_points dict.
                end_points = slim.utils.convert_collection_to_dict(
                    end_points_collection)
                if global_pool:
                    net = tf.reduce_mean(net, [1, 2], keep_dims=True, name='global_pool')
                    end_points['global_pool'] = net
                if num_classes:
                    net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                                       scope='dropout7')
                    net = slim.conv2d(net, num_classes, [1, 1],
                                      activation_fn=None,
                                      normalizer_fn=None,
                                      biases_initializer=tf.zeros_initializer(),
                                      scope='fc8')
                    if spatial_squeeze:
                        net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
                    end_points[sc.name + '/fc8'] = net
                return net, end_points

alexnet_v2.default_image_size = 224
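The Horovod-specific setup is not shown in train.py below; it follows the standard pattern from the Horovod TensorFlow examples, roughly like this (the exact values are not important here, and config and hooks are what get passed to MonitoredTrainingSession):

import horovod.tensorflow as hvd

hvd.init()
# pin each worker process to one GPU
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())
# broadcast the initial variables from rank 0 so all workers start from the same weights
hooks = [hvd.BroadcastGlobalVariablesHook(0)]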
And here is train.py:
def train_data_generator():
    # returns the training tf.data.Dataset (construction omitted here)
    return ds

def validation_data_generator():
    # returns the validation tf.data.Dataset (construction omitted here)
    return ds
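# Note: the real dataset construction is omitted above. As a rough, hypothetical
# sketch only (names such as filenames and parse_and_preprocess are placeholders,
# not my actual code), each generator returns a tf.data.Dataset whose elements
# match the (labels, images, f) tuple unpacked from the iterator in main(), e.g.:
#
#   ds = tf.data.TFRecordDataset(filenames)
#   ds = ds.shard(hvd.size(), hvd.rank())   # one ImageNet shard per Horovod worker
#   ds = ds.map(parse_and_preprocess)       # decode/resize and build the (label, image, f) element
#   ds = ds.shuffle(10000).repeat().batch(batch_size)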
def main(argv=None):
    num_classes = 1001
    # opt and lr are defined elsewhere (omitted); opt is rebound to the Horovod optimizer
    opt_gpu = opt(lr)
    opt = hvd.DistributedOptimizer(opt_gpu)
    train_dataset = train_data_generator()
    validation_dataset = validation_data_generator()
    iterator = tf.data.Iterator.from_structure(output_types=train_dataset.output_types,
                                               output_shapes=train_dataset.output_shapes)
    train_init_op = iterator.make_initializer(train_dataset)
    validation_init_op = iterator.make_initializer(validation_dataset)
    labels, images, f = iterator.get_next()
    images = tf.reshape(images, shape=[batch_size, height, width, 3])
    labels = tf.reshape(labels, [batch_size])
    is_training = tf.placeholder(tf.bool)
    dropout_keep_prob = tf.placeholder(tf.float32)
    weight_decay = tf.placeholder(tf.float32)
    with slim.arg_scope(alexnet.alexnet_v2_arg_scope(weight_decay=weight_decay,
                                                     is_training=is_training)):
        pred, _ = alexnet.alexnet_v2(images, num_classes, spatial_squeeze=True,
                                     is_training=is_training,
                                     dropout_keep_prob=dropout_keep_prob)
    cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pred, labels=labels,
                                                       name='cross-entropy'))
    l2_loss = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
    loss = cross_entropy + l2_loss
    pred_soft = tf.nn.softmax(pred)
    top_1 = tf.reduce_mean(tf.cast(tf.nn.in_top_k(pred_soft, labels, 1), dtype=tf.float32), name='top_1')
    top_5 = tf.reduce_mean(tf.cast(tf.nn.in_top_k(pred_soft, labels, 5), dtype=tf.float32), name='top_5')
    # run the batch-norm moving-average updates together with the train step
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies([tf.group(*update_ops)]):
        train_step = opt.minimize(loss, global_step=global_steps)
    with tf.control_dependencies([train_step, loss, top_1, top_5]):
        train_op = tf.no_op(name='train_op')
    with tf.control_dependencies([loss, top_1, top_5]):
        eval_op = tf.no_op(name='eval_op')
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           config=config,
                                           hooks=hooks) as mon_sess:
        global_step = 0
        mon_sess._coordinated_creator.tf_sess.run(train_init_op)
        while global_step < max_steps:
            global_step, _ = mon_sess.run([global_steps, train_op],
                                          feed_dict={weight_decay: 0.02,
                                                     is_training: True,
                                                     dropout_keep_prob: 0.5})
        mon_sess._coordinated_creator.tf_sess.run(validation_init_op)
        t1, t5, el = [], [], []
        for i in range(eval_steps):
            _, l, top1, top5 = mon_sess.run([eval_op, loss, top_1, top_5],
                                            feed_dict={weight_decay: 0.02,
                                                       is_training: True,
                                                       dropout_keep_prob: 0.5})
            t1.append(top1)
            t5.append(top5)
            el.append(l)
        import numpy
        ac1 = numpy.mean(t1)
        ac5 = numpy.mean(t5)
        evalloss = numpy.mean(el)
        print('validation done top1 accuracy: %f , top5 accuracy: %f , validation loss: %f' % (ac1, ac5, evalloss))
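For reference, the batch-norm moving averages are updated through tf.GraphKeys.UPDATE_OPS (hence the control dependency around opt.minimize above). A quick way to confirm the update ops are actually collected before training (slim.batch_norm adds two per BN layer, one for the moving mean and one for the moving variance):

update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
print('number of batch-norm update ops: %d' % len(update_ops))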
I think I have set some parameters incorrectly, but I cannot find which ones. Any suggestions would be appreciated.