I'm running into some trouble using the Estimator API with a constant_initializer. Originally I was trying to load model weights from a .npy file, but the evaluation loss didn't seem to move at all.
I put together a small example that appears to have the same problem. When I replace the constant_initializer with any other random initializer, it seems to work. Can anyone explain what is going on?
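For context, the original .npy-based initialization was roughly along these lines (a simplified sketch; the file name, dict keys, and helper are placeholders, not my real code):

import numpy as np
import tensorflow as tf

# Weights stored as a dict of numpy arrays, e.g. {'conv1': [kernel, bias], ...}
# (newer NumPy versions also need allow_pickle=True)
weights = np.load('vgg16_weights.npy', encoding='latin1').item()

def conv_from_npy(inputs, name):
    # Build a conv layer whose kernel/bias start from the stored arrays
    # instead of a random initializer.
    kernel, bias = weights[name]
    return tf.layers.conv2d(
        inputs,
        filters=kernel.shape[-1],
        kernel_size=kernel.shape[:2],
        padding='same',
        activation=tf.nn.elu,
        kernel_initializer=tf.constant_initializer(kernel),
        bias_initializer=tf.constant_initializer(bias),
        name=name)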
Here is the main part of the code:
# Big thanks to https://medium.com/onfido-tech/higher-level-apis-in-tensorflow-67bfb602e6c0
import os

import tensorflow as tf
from tensorflow.contrib.learn import ModeKeys
from tensorflow.contrib.learn import learn_runner

from fcn import fcn32_vgg
from fcn import loss as fcn_loss
import voc_dataset
from voc_to_tfrecord import load_voc_dataset
from test_model import SimpleNet, WeightInitializerHook

FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_string(
    flag_name='weights_dir', default_value='...',
    docstring='Top-level directory where the input data will be looked for.')
tf.app.flags.DEFINE_string(
    flag_name='model_dir', default_value='...',
    docstring='Output directory for model and training stats.')
tf.app.flags.DEFINE_string(
    flag_name='data_dir', default_value='...',
    docstring='Directory containing the "voc_segmentation_{train|val}.tfrecord" files.')
def run_experiment(argv=None):
    # Define model parameters
    params = tf.contrib.training.HParams(
        learning_rate=0.002,
        n_classes=22,
        train_steps=100,
        eval_steps=1,
        min_eval_frequency=10,
        eval_delay_secs=0
    )

    # Set the run_config and the directory to save the model and stats
    run_config = tf.contrib.learn.RunConfig()
    run_config = run_config.replace(model_dir=FLAGS.model_dir)
    run_config = run_config.replace(tf_random_seed=42)

    learn_runner.run(
        experiment_fn=experiment_fn,
        run_config=run_config,  # RunConfig
        schedule="train_and_evaluate",  # What to run
        hparams=params  # HParams
    )
def experiment_fn(run_config, params):
    # You can change a subset of the run_config properties as needed
    run_config = run_config.replace(
        save_checkpoints_steps=params.min_eval_frequency)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,  # First-class function
        params=params,      # HParams
        config=run_config   # RunConfig
    )

    # Set up the data loaders
    train_input_fn, train_input_hook = voc_dataset.get_inputs(
        batch_size=64,
        tfrecords_path=os.path.join(FLAGS.data_dir, 'voc_segmentation_train.tfrecords'),
        name_scope='train_data',
        shuffle_and_repeat=True)
    eval_input_fn, eval_input_hook = voc_dataset.get_inputs(
        batch_size=64,
        tfrecords_path=os.path.join(FLAGS.data_dir, 'voc_segmentation_val.tfrecords'),
        name_scope='eval_data',
        shuffle_and_repeat=False)

    # Define the experiment
    experiment = tf.contrib.learn.Experiment(
        estimator=estimator,  # Estimator
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        train_steps=params.train_steps,
        eval_steps=params.eval_steps,
        min_eval_frequency=params.min_eval_frequency,  # Eval frequency
        train_monitors=[train_input_hook],  # Hooks for training
        eval_hooks=[eval_input_hook],       # Hooks for evaluation
        eval_delay_secs=params.eval_delay_secs,
    )
    return experiment
def model_fn(features, labels, mode, params):
    is_training = mode == ModeKeys.TRAIN

    net = SimpleNet()
    net.build(features, is_training=is_training)
    logits = net.logits
    predictions = net.predictions

    loss = None
    train_op = None
    eval_metric_ops = {}
    if mode != ModeKeys.INFER:
        loss = fcn_loss.loss(logits, labels, params.n_classes)
    if mode == ModeKeys.TRAIN:
        train_op = get_train_op_fn(loss, params)

    tf.summary.image('INPUT' + str(is_training), features, max_outputs=64)
    tf.summary.image('OUTPUT' + str(is_training), tf.expand_dims(tf.argmax(predictions, -1) / 22, -1), max_outputs=64)
    tf.summary.image('LABELS' + str(is_training), tf.expand_dims(tf.argmax(labels, -1) / 22, -1), max_outputs=64)

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions={'result': predictions},
        loss=loss,
        train_op=train_op,
        # eval_metric_ops=eval_metric_ops
    )
def get_train_op_fn(loss, params):
    return tf.contrib.layers.optimize_loss(
        loss=loss,
        global_step=tf.train.get_global_step(),
        optimizer=tf.train.AdamOptimizer,
        learning_rate=params.learning_rate,
        name='optimize_loss',
        summaries=['loss']
    )
# Run script ##############################################
if __name__ == "__main__":
    tf.app.run(
        main=run_experiment
    )
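voc_dataset.get_inputs is not shown above; it follows the input_fn + iterator-initializer-hook pattern from the Medium post credited at the top of the script, roughly like this (a sketch from memory, where parse_example stands in for the actual TFRecord parsing):

class IteratorInitializerHook(tf.train.SessionRunHook):
    """Runs the dataset iterator's initializer once the session is created."""
    def __init__(self):
        self.iterator_initializer_func = None

    def after_create_session(self, session, coord):
        self.iterator_initializer_func(session)


def get_inputs(batch_size, tfrecords_path, name_scope, shuffle_and_repeat):
    hook = IteratorInitializerHook()

    def input_fn():
        with tf.name_scope(name_scope):
            dataset = tf.data.TFRecordDataset(tfrecords_path)
            dataset = dataset.map(parse_example)  # decode image/label pairs
            if shuffle_and_repeat:
                dataset = dataset.shuffle(buffer_size=1000).repeat()
            dataset = dataset.batch(batch_size)
            iterator = dataset.make_initializable_iterator()
            # Defer the iterator initialization until the session exists.
            hook.iterator_initializer_func = lambda sess: sess.run(iterator.initializer)
            return iterator.get_next()

    return input_fn, hook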
Here is the architecture:
class SimpleNet:

    def __init__(self, vgg16_npy_path=None):
        pass

    def build(self, rgb, is_training=False, debug=False):
        k_init = None
        if is_training:
            k_init = tf.constant_initializer(0.1)

        self.conv_1 = tf.layers.conv2d(rgb, 5, (5, 5), activation=tf.nn.elu, padding='same', name='conv1', kernel_initializer=k_init)
        self.conv_2 = tf.layers.conv2d(self.conv_1, 10, (5, 5), activation=tf.nn.elu, padding='same', name='conv2', kernel_initializer=k_init)
        self.conv_3 = tf.layers.conv2d(self.conv_2, 15, (5, 5), activation=tf.nn.elu, padding='same', name='conv3', kernel_initializer=k_init)
        self.conv_4 = tf.layers.conv2d(self.conv_3, 20, (5, 5), activation=tf.nn.elu, padding='same', name='conv4', kernel_initializer=k_init)
        self.logits = tf.layers.conv2d(self.conv_4, 22, (5, 5), activation=None, padding='same', name='logits', kernel_initializer=k_init)

        with tf.name_scope('softmax'):
            self.predictions = tf.nn.softmax(self.logits)
If I set the is_training flag to False, the evaluation loss does seem to go down. Otherwise it stays completely flat. Any ideas as to why this happens?
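In case it helps, this is the kind of minimal side-by-side check I have been running outside the Estimator framework, comparing tf.constant_initializer(0.1) against the default random initializer on a tiny conv net (toy random data, not the VOC pipeline; shapes, step count, and learning rate are arbitrary):

import numpy as np
import tensorflow as tf

def train_once(k_init, seed=42):
    """Train a tiny conv net on random data and return the loss curve."""
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    x = tf.constant(np.random.rand(8, 16, 16, 3), dtype=tf.float32)
    y = tf.constant(np.random.randint(0, 22, size=(8, 16, 16)), dtype=tf.int32)

    h = tf.layers.conv2d(x, 10, (5, 5), padding='same', activation=tf.nn.elu,
                         kernel_initializer=k_init, name='conv1')
    logits = tf.layers.conv2d(h, 22, (5, 5), padding='same', name='logits',
                              kernel_initializer=k_init)
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits))
    train_op = tf.train.AdamOptimizer(0.002).minimize(loss)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        return [sess.run([train_op, loss])[1] for _ in range(50)]

print('constant:', train_once(tf.constant_initializer(0.1))[::10])
print('random:  ', train_once(None)[::10])  # None -> the layer's default (random) initializer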