I trained a model and saved a checkpoint. The code for my model is:
with tf.variable_scope(scope):
    self.inputs = tf.placeholder(shape=[None, 80, 80, 1], dtype=tf.float32)
    self.conv_1 = slim.conv2d(activation_fn=tf.nn.relu, inputs=self.inputs, num_outputs=32,
                              kernel_size=[8, 8], stride=4, padding='SAME')
    self.conv_2 = slim.conv2d(activation_fn=tf.nn.relu, inputs=self.conv_1, num_outputs=64,
                              kernel_size=[4, 4], stride=2, padding='SAME')
    self.conv_3 = slim.conv2d(activation_fn=tf.nn.relu, inputs=self.conv_2, num_outputs=64,
                              kernel_size=[3, 3], stride=1, padding='SAME')
    self.fc = slim.fully_connected(slim.flatten(self.conv_3), 512, activation_fn=tf.nn.elu)
    # Output layers for policy and value estimations
    self.policy = slim.fully_connected(self.fc,
                                       cfg.ACTION_DIM,
                                       activation_fn=tf.nn.softmax,
                                       biases_initializer=None)
    self.value = slim.fully_connected(self.fc,
                                      1,
                                      activation_fn=None,
                                      biases_initializer=None)
About 32 processes run concurrently, and each process holds its own copy of the global network defined by the code above, where scope is the id of each process. The scope of the global network is 'global'.
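For reference, the per-process copies might be created along these lines (a minimal sketch; the AC_Network class name and the 'worker_<i>' naming are assumptions, inferred only from the worker_15 prefix in the error log below):

# Hypothetical sketch: one global network plus one copy per worker process,
# each built from the model code above under its own variable scope.
global_network = AC_Network(scope='global')
workers = [AC_Network(scope='worker_' + str(i)) for i in range(32)]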
Afterwards, I want to add more layers after the self.fc layer:
with tf.variable_scope(scope):
    self.inputs = tf.placeholder(shape=[None, 80, 80, 1], dtype=tf.float32)
    self.conv_1 = slim.conv2d(activation_fn=tf.nn.relu, inputs=self.inputs, num_outputs=32,
                              kernel_size=[8, 8], stride=4, padding='SAME')
    self.conv_2 = slim.conv2d(activation_fn=tf.nn.relu, inputs=self.conv_1, num_outputs=64,
                              kernel_size=[4, 4], stride=2, padding='SAME')
    self.conv_3 = slim.conv2d(activation_fn=tf.nn.relu, inputs=self.conv_2, num_outputs=64,
                              kernel_size=[3, 3], stride=1, padding='SAME')
    self.fc = slim.fully_connected(slim.flatten(self.conv_3), 512, activation_fn=tf.nn.elu)
    # Output layers for policy and value estimations
    self.policy = slim.fully_connected(self.fc,
                                       cfg.ACTION_DIM,
                                       activation_fn=tf.nn.softmax,
                                       biases_initializer=None)
    self.value = slim.fully_connected(self.fc,
                                      1,
                                      activation_fn=None,
                                      biases_initializer=None)
    # Newly added layer after self.fc
    self.new_fc_1 = slim.fully_connected(self.fc, 512, activation_fn=tf.nn.elu)
However, when I restore the model, it reports the following errors:
2017-08-03 22:23:43.473157: W tensorflow/core/framework/op_kernel.cc:1158] Not found: Key worker_15/fully_connected_3/weights not found in checkpoint
2017-08-03 22:23:43.477197: W tensorflow/core/framework/op_kernel.cc:1158] Data loss: Checksum does not match: stored 379803423 vs. calculated on the restored bytes 2648422677
2017-08-03 22:23:43.477210: W tensorflow/core/framework/op_kernel.cc:1158] Data loss: Checksum does not match: stored 3963326522 vs. calculated on the restored bytes 3154501583
2017-08-03 22:23:43.477200: W tensorflow/core/framework/op_kernel.cc:1158] Data loss: Checksum does not match: stored 3893236466 vs. calculated on the restored bytes 1767411214
2017-08-03 22:23:43.478276: W tensorflow/core/framework/op_kernel.cc:1158] Data loss: Checksum does not match: stored 4239176201 vs. calculated on the restored bytes 3213118706
2017-08-03 22:23:43.480438: W tensorflow/core/framework/op_kernel.cc:1158] Data loss: Checksum does not match: stored 442335910 vs. calculated on the restored bytes 4248164641
2017-08-03 22:23:43.483885: W tensorflow/core/framework/op_kernel.cc:1158] Data loss: Checksum does not match: stored 3105262865 vs. calculated on the restored bytes 2648422677
2017-08-03 22:23:43.483953: W tensorflow/core/framework/op_kernel.cc:1158] Not found: Key worker_15/fully_connected_3/weights not found in checkpoint
[[Node: save/RestoreV2_128 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_arg_save/Const_0_0, save/RestoreV2_128/tensor_names, save/RestoreV2_128/shape_and_slices)]]
2017-08-03 22:23:43.486987: W tensorflow/core/framework/op_kernel.cc:1158] Not found: Key worker_15/fully_connected_3/weights not found in checkpoint
[[Node: save/RestoreV2_128 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_arg_save/Const_0_0, save/RestoreV2_128/tensor_names, save/RestoreV2_128/shape_and_slices)]]
2017-08-03 22:23:43.490616: W tensorflow/core/framework/op_kernel.cc:1158] Not found: Key worker_15/fully_connected_3/weights not found in checkpoint
[[Node: save/RestoreV2_128 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_arg_save/Const_0_0, save/RestoreV2_128/tensor_names, save/RestoreV2_128/shape_and_slices)]]
2017-08-03 22:23:43.491951: W tensorflow/core/framework/op_kernel.cc:1158] Not found: Key worker_15/fully_connected_3/weights not found in checkpoint
[[Node: save/RestoreV2_128 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_arg_save/Const_0_0, save/RestoreV2_128/tensor_names, save/RestoreV2_128/shape_and_slices)]]
2017-08-03 22:23:43.491957: W tensorflow/core/framework/op_kernel.cc:1158] Not found: Key worker_15/fully_connected_3/weights not found in checkpoint
[[Node: save/RestoreV2_128 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_arg_save/Const_0_0, save/RestoreV2_128/tensor_names, save/RestoreV2_128/shape_and_slices)]]
2017-08-03 22:23:43.494310: W tensorflow/core/framework/op_kernel.cc:1158] Not found: Key worker_15/fully_connected_3/weights not found in checkpoint
[[Node: save/RestoreV2_128 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_arg_save/Const_0_0, save/RestoreV2_128/tensor_names, save/RestoreV2_128/shape_and_slices)]]
.... ....
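To see which variables the old checkpoint actually contains (and confirm that worker_15/fully_connected_3/weights is indeed missing), the checkpoint keys can be listed; a minimal sketch using the standard TF 1.x checkpoint reader, with an illustrative path:

import tensorflow as tf

# Print every variable name stored in the checkpoint file.
reader = tf.train.NewCheckpointReader('./model/model-100.ckpt')
for name in sorted(reader.get_variable_to_shape_map()):
    print(name)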
I save the model with the following code:
saver.save(sess, self.model_path+'/model-'+str(episode_count)+'.ckpt')
and this is the code that defines the saver:
value_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='global/old_scope')
value_list.extend(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='global/actor_critic'))
saver = tf.train.Saver(value_list, max_to_keep=100)
with tf.Session(config=tf_configs) as sess:
    coord = tf.train.Coordinator()
    if load_model:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(model_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())
How can I restore the pre-trained model after adding new layers with randomly initialized parameters to the current neural network?
Answer 0 (score: 1)
You can use two separate variable scopes: one for saving and loading, and one for the new layers. Then you can tell the saver to use only the variables from the first scope:
saver = tf.train.Saver(
    tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="save_scope")
)
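Put together, a minimal sketch of this idea (the scope names save_scope and new_scope, and the checkpoint path, are illustrative, not from the question):

import tensorflow as tf
import tensorflow.contrib.slim as slim

inputs = tf.placeholder(shape=[None, 80, 80, 1], dtype=tf.float32)
with tf.variable_scope("save_scope"):
    # Everything here already exists in the old checkpoint.
    fc = slim.fully_connected(slim.flatten(inputs), 512, activation_fn=tf.nn.elu)
with tf.variable_scope("new_scope"):
    # New, randomly initialized layer; kept out of the restoring saver.
    new_fc = slim.fully_connected(fc, 512, activation_fn=tf.nn.elu)

restore_saver = tf.train.Saver(
    tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="save_scope"))
full_saver = tf.train.Saver()  # covers both scopes, for future checkpoints

with tf.Session() as sess:
    # Initialize everything (giving the new layer random weights), then
    # overwrite the old scope's variables with the checkpoint values.
    sess.run(tf.global_variables_initializer())
    ckpt = tf.train.get_checkpoint_state('./model')
    restore_saver.restore(sess, ckpt.model_checkpoint_path)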
Answer 1 (score: 0)
After googling for a long time, and with the help of @BlueSun, I found that the following approach solves the problem. Before adding the new scope, first save the model using only the variables in the current scopes:
value_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='global/old_scope')
value_list.extend(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='global/actor_critic'))
saver = tf.train.Saver(value_list, max_to_keep=100)
Then train the network as usual.
Later, before running the model, add the new scope and define a new saver with the following code:
value_list = []
value_list.extend(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='global/old_scope'))
value_list.extend(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='global/actor_critic'))
value_list.extend(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='global/added_layer'))
saver = tf.train.Saver(value_list, max_to_keep=100)
with tf.Session(config=tf_configs) as sess:
    coord = tf.train.Coordinator()
    if load_model:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(model_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="global"), max_to_keep=100)
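With the saver redefined to cover the entire global scope, checkpoints written from this point on include the added_layer variables as well, so later runs can restore everything directly. Subsequent saves can simply reuse the original call:

saver.save(sess, self.model_path + '/model-' + str(episode_count) + '.ckpt')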
The network code then looks like this:
with tf.variable_scope(scope):
    with tf.variable_scope('old_scope'):
        self.inputs = tf.placeholder(shape=[None, 80, 80, 1], dtype=tf.float32)
        self.conv_1 = slim.conv2d(activation_fn=tf.nn.relu, inputs=self.inputs, num_outputs=32,
                                  kernel_size=[8, 8], stride=4, padding='SAME')
        self.conv_2 = slim.conv2d(activation_fn=tf.nn.relu, inputs=self.conv_1, num_outputs=64,
                                  kernel_size=[4, 4], stride=2, padding='SAME')
        self.conv_3 = slim.conv2d(activation_fn=tf.nn.relu, inputs=self.conv_2, num_outputs=64,
                                  kernel_size=[3, 3], stride=1, padding='SAME')
        self.fc = slim.fully_connected(slim.flatten(self.conv_3), 512, activation_fn=tf.nn.elu)
    with tf.variable_scope('added_layer'):
        self.fc_1 = slim.fully_connected(self.fc, 512, activation_fn=tf.nn.elu)
    with tf.variable_scope('actor_critic'):
        # Output layers for policy and value estimations
        self.policy = slim.fully_connected(self.fc_1,
                                           cfg.ACTION_DIM,
                                           activation_fn=tf.nn.softmax,
                                           biases_initializer=None)
        self.value = slim.fully_connected(self.fc_1,
                                          1,
                                          activation_fn=None,
                                          biases_initializer=None)
Now it works fine, although the code looks a bit inelegant.
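A slightly tidier variant for restoring a checkpoint that predates the new layer is to build the restore list by exclusion rather than listing every scope; a sketch, assuming tf.contrib.slim is imported as slim as in the model code:

# Restore everything under 'global' except the newly added layer.
variables_to_restore = slim.get_variables_to_restore(
    include=['global'], exclude=['global/added_layer'])
restore_saver = tf.train.Saver(variables_to_restore, max_to_keep=100)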