I can't save/restore my trained TensorFlow model. The visible symptom is that the restored model performs poorly because it is using random weights rather than the trained ones. Others have hit this problem, but my root cause appears to be different: possibly duplicated variable names.
I have confirmed that the weights in the checkpoint file match the ones used by the trained model, and that the weights after restoring are completely different. I also see duplicated variables ending in _1 before saving. Even more unusually, after restoring there appear to be duplicated variables with identical names.
I have been careful to run tf.reset_default_graph() both before setting up the graph the first time and again before restoring it. One potential problem may be that I run global_variables_initializer() before restoring the variables; if I don't do that, I get uninitialized-variable errors when running the restored model.
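For reference, my understanding of the documented pattern is that restore() replaces initialization entirely, roughly like this (a minimal sketch; build_graph() and checkpoint_prefix are hypothetical stand-ins for my setup below):

import tensorflow as tf

tf.reset_default_graph()
y_out = build_graph()  # hypothetical: rebuilds exactly the graph that was saved
saver = tf.train.Saver()
with tf.Session() as sess:
    # restore() assigns every saved variable itself, so no
    # global_variables_initializer() call should be needed first
    saver.restore(sess, checkpoint_prefix)  # hypothetical checkpoint prefix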
I am using the latest version of TensorFlow with GPU support. Here is the code, split into build, train/save, and restore:
#build model
def model_ten_layer_unet(X, y, is_training):
    # define our weights (e.g. init_two_layer_convnet)
    M = 4
    # setup variables
    Wconv1 = tf.get_variable("Wconv1", shape=[3, 3, 3, 1, M*32])  # SAME: never change output size
    bconv1 = tf.get_variable("bconv1", shape=[M*32])
    Wconv2 = tf.get_variable("Wconv2", shape=[3, 3, 3, M*32, M*32])
    bconv2 = tf.get_variable("bconv2", shape=[M*32])
    ...
    #Wconv12 = tf.get_variable("Wconv12", shape=[3, 3, 3, 1, M*32])  # SAME: final layer combines stacked outputs
    #bconv12 = tf.get_variable("bconv12", shape=[1])
    # define our graph (e.g. two_layer_convnet)
    d1 = tf.nn.max_pool3d(tf.expand_dims(X, -1), ksize=[1,2,2,2,1], strides=[1,2,2,2,1], padding='VALID')
    print('d1 shape: ', d1.shape)
    ...
    a12 = tf.layers.conv3d_transpose(c10, filters=1, kernel_size=[3,3,3], strides=[2,2,2], padding='SAME')
    print('a12 shape: ', a12.shape)
    y_out = tf.add(tf.squeeze(a12, -1), X)
    print('y_out shape: ', y_out.shape)
    return y_out
def run_model(session, predict, loss_val, Xd, yd, mean_image,
              epochs=1, batch_size=64, print_every=100,
              training=None, plot_losses=False):
    # have tensorflow compute accuracy (RMSE normalized by the mean image)
    mse = tf.reduce_mean(tf.square(tf.subtract(predict, y)))
    accuracy = tf.sqrt(mse) / tf.reduce_mean(tf.cast(mean_image, tf.float32))
    # shuffle indices
    train_indicies = np.arange(Xd.shape[0])
    np.random.shuffle(train_indicies)
    training_now = training is not None
    # set up the ops we want to compute (and optimize);
    # mse is just a placeholder entry here: if we have a
    # training op, it replaces mse in the list below
    variables = [mean_loss, accuracy, mse]
    if training_now:
        variables[-1] = training
    # counter
    iter_cnt = 0
    for e in range(epochs):
        # keep track of losses and accuracy
        losses = []
        accuracies = []
        # make sure we iterate over the dataset once
        for i in range(int(math.ceil(Xd.shape[0] / batch_size))):
            # generate indices for the batch
            start_idx = (i * batch_size) % Xd.shape[0]
            idx = train_indicies[start_idx:start_idx + batch_size]
            # create a feed dictionary for this batch
            feed_dict = {X: Xd[idx, :],
                         y: yd[idx],
                         is_training: training_now}
            # get batch size
            actual_batch_size = yd[idx].shape[0]
            # have tensorflow compute loss and accuracy
            # and (if given) perform a training step
            loss, accuracy, _ = session.run(variables, feed_dict=feed_dict)
            # aggregate performance stats
            losses.append(loss)
            accuracies.append(accuracy)
            # print every now and then
            if training_now and (iter_cnt % print_every) == 0:
                print("Iteration {0}: with minibatch training loss = {1:.3g} and accuracy of {2:.2g}"
                      .format(iter_cnt, loss, accuracy))
            iter_cnt += 1
        total_accuracy = np.mean(accuracies)
        total_loss = np.mean(losses)
        ...
    return total_loss, total_accuracy, y_out
tf.reset_default_graph()
X = tf.placeholder(tf.float32, [None, 64, 64, 64])
y = tf.placeholder(tf.float32, [None, 64, 64, 64])
is_training = tf.placeholder(tf.bool)
y_out = model_ten_layer_unet(X,y,is_training)
total_loss = tf.square(tf.subtract(y_out, y))
mean_loss = tf.reduce_mean(total_loss)
global_step = tf.Variable(0, trainable=False)
boundaries = [5000,10000,15000]
values = [4e-4,2e-4,5e-5,1e-5]
learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
optimizer = tf.train.AdamOptimizer(learning_rate) # select optimizer and set learning rate
# enable saving and loading models
saver = tf.train.Saver()
#set up path for saving models
home = os.getenv("HOME")
work_dir = 'assignment2'
model_dir = 'models'
model_name = 'unet1'
model_path = os.path.join(home,work_dir,model_dir)
if not os.path.exists(model_path):
    os.makedirs(model_path)
model_pathname = os.path.join(model_path,model_name)
# batch normalization in tensorflow requires this extra dependency
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(extra_update_ops):
    train_step = optimizer.minimize(mean_loss, global_step=global_step)
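As an aside, I would have expected duplicate names to be impossible within a single graph, because tf.get_variable raises an error when a name already exists. A minimal sketch of that behavior (the shape is just illustrative):

import tensorflow as tf

tf.reset_default_graph()
w = tf.get_variable("Wconv1", shape=[3, 3, 3, 1, 128])
try:
    w = tf.get_variable("Wconv1", shape=[3, 3, 3, 1, 128])
except ValueError as err:
    print(err)  # "Variable Wconv1 already exists, disallowed. ..." in TF 1.x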
#train model
with tf.Session() as sess:
    #with tf.device("/gpu:0"): #"/cpu:0" or "/gpu:0"
    sess.run(tf.global_variables_initializer())
    #print(sess.run('Wconv1:0'))
    for e in range(1):
        print('Training epoch ', e+1)
        run_model(sess, y_out, mean_loss, X_train, y_train, mean_image, 1, 32, 1, train_step, False)
        print('Validation epoch ', e+1)
        _, _, y_hat = run_model(sess, y_out, mean_loss, X_val, y_val, mean_image, 1, 32, 1)
    print('Global Variables')
    for v in tf.global_variables():
        print(v.name)
    print('Local Variables')
    for v in tf.local_variables():
        print(v.name)
    saver.export_meta_graph(model_pathname, clear_devices=True)
    save_path = saver.save(sess, model_pathname, global_step=global_step)
    print("Model saved in path: %s" % save_path)
#Output
Training epoch 1
Iteration 0: with minibatch training loss = 7.95e+03 and accuracy of 0.2
...
Iteration 27: with minibatch training loss = 7.51e+03 and accuracy of 0.2
Global Variables
Wconv1:0
bconv1:0
Wconv2:0
bconv2:0
Wconv3:0
bconv3:0
conv3d_transpose/kernel:0
conv3d_transpose/bias:0
conv3d_transpose_1/kernel:0
conv3d_transpose_1/bias:0
Variable:0
beta1_power:0
beta2_power:0
Wconv1/Adam:0
Wconv1/Adam_1:0 **Duplicated variables with _1 before saving**
bconv1/Adam:0
bconv1/Adam_1:0
Wconv2/Adam:0
Wconv2/Adam_1:0
bconv2/Adam:0
bconv2/Adam_1:0
Wconv3/Adam:0
Wconv3/Adam_1:0
bconv3/Adam:0
bconv3/Adam_1:0
conv3d_transpose/kernel/Adam:0
conv3d_transpose/kernel/Adam_1:0
...
conv3d_transpose_1/bias/Adam:0
conv3d_transpose_1/bias/Adam_1:0
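For completeness, one way to double-check what actually landed on disk is to list the checkpoint contents directly (a minimal sketch; save_path is the prefix returned by saver.save() above):

import tensorflow as tf

# print every variable name and shape stored in the checkpoint just written
for name, shape in tf.train.list_variables(save_path):
    print(name, shape)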
#restore model
tf.reset_default_graph()
# Later, launch the model, use the saver to restore variables from disk, and
# do some work with the model.
with tf.Session() as sess:
    X = tf.placeholder(tf.float32, [None, 64, 64, 64])
    y = tf.placeholder(tf.float32, [None, 64, 64, 64])
    is_training = tf.placeholder(tf.bool)
    y_out = model_ten_layer_unet(X, y, is_training)
    total_loss = tf.square(tf.subtract(y_out, y))
    mean_loss = tf.reduce_mean(total_loss)
    sess.run(tf.global_variables_initializer())
    new_saver = tf.train.import_meta_graph(save_path + '.meta')
    # Restore variables from disk.
    new_saver.restore(sess, save_path)
    print("Model restored.")
    # Load output images
    print('Global Variables')
    for v in tf.global_variables():
        print(v.name)
    print('Local Variables')
    for v in tf.local_variables():
        print(v.name)
    _, _, y_hat = run_model(sess, y_out, mean_loss, X_val, y_val, mean_image, 1, 32, 1)
    y_hat = y_hat.eval(feed_dict={X: X_val[0:9,], is_training: False})
#Output
d1 shape: (?, 32, 32, 32, 1)
...
y_out shape: (?, 64, 64, 64)
Global Variables
Wconv1:0
bconv1:0
Wconv2:0
bconv2:0
Wconv3:0
bconv3:0
conv3d_transpose/kernel:0
conv3d_transpose/bias:0
conv3d_transpose_1/kernel:0
conv3d_transpose_1/bias:0
Wconv1:0 **Repeated variables with the same name**
bconv1:0
Wconv2:0
bconv2:0
Wconv3:0
bconv3:0
conv3d_transpose/kernel:0
conv3d_transpose/bias:0
conv3d_transpose_1/kernel:0
conv3d_transpose_1/bias:0
Variable:0
beta1_power:0
beta2_power:0
Wconv1/Adam:0
Wconv1/Adam_1:0
bconv1/Adam:0
bconv1/Adam_1:0
Wconv2/Adam:0
Wconv2/Adam_1:0
bconv2/Adam:0
bconv2/Adam_1:0
Wconv3/Adam:0
Wconv3/Adam_1:0
bconv3/Adam:0
bconv3/Adam_1:0
conv3d_transpose/kernel/Adam:0
conv3d_transpose/kernel/Adam_1:0
...
conv3d_transpose_1/bias/Adam:0
conv3d_transpose_1/bias/Adam_1:0
Local Variables
Epoch 1, Overall loss = 9.01e+03 and accuracy of 0.214 **Poor validation performance**
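To quantify "completely different", a live tensor can be compared against its checkpoint copy inside the restore session above (a sketch; save_path is the prefix from the training run, and Wconv1 is one of my variables):

import numpy as np
import tensorflow as tf

reader = tf.train.load_checkpoint(save_path)  # checkpoint reader over the saved prefix
ckpt_w = reader.get_tensor('Wconv1')          # checkpoint keys omit the ':0' suffix
live_w = sess.run('Wconv1:0')                 # whichever Wconv1 the session resolves to
print('max abs diff:', np.max(np.abs(ckpt_w - live_w)))  # a large value means restore landed elsewhere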
Answer 0 (score: 1)
You have posted a lot of code, and it would take a long time to work through all of it. Here is a simple example of how to restore the latest checkpoint:
import os
import tensorflow as tf

def predict(img_name):  # hypothetical wrapper, implied by the trailing return and img_name
    with tf.Graph().as_default():
        x = tf.placeholder(tf.float32, shape=[None, 784])
        output = inference(x)  # custom model-building function
        saver = tf.train.Saver()
        sess = tf.Session()
        base_dir = os.path.dirname(os.path.abspath(__file__))
        checkpoint_path = os.path.join(os.path.dirname(base_dir), "logistic_regression/logistic_logs/")
        # latest_checkpoint finds the newest checkpoint prefix in the directory;
        # restore() then assigns every saved variable, so no initializer is needed
        saver.restore(sess, tf.train.latest_checkpoint(checkpoint_path))
        result = sess.run(tf.argmax(output, 1), feed_dict={x: conv_mnist(img_name)})  # conv_mnist: custom preprocessing
        print("Result:", result)
        return result
I hope this code example helps you.
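If you do want to keep the import_meta_graph approach from the question instead, the key point is to not rebuild the model on top of the imported graph: import, restore, and look tensors up by name. A sketch, where the tensor names are assumptions to verify against the actual graph (X_val as in the question):

import tensorflow as tf

tf.reset_default_graph()
with tf.Session() as sess:
    # the meta graph already contains the model, so do not call
    # model_ten_layer_unet() or global_variables_initializer() here
    new_saver = tf.train.import_meta_graph(save_path + '.meta')
    new_saver.restore(sess, save_path)
    graph = tf.get_default_graph()
    # unnamed placeholders get default names such as 'Placeholder:0';
    # verify with [op.name for op in graph.get_operations()]
    X = graph.get_tensor_by_name('Placeholder:0')
    is_training = graph.get_tensor_by_name('Placeholder_2:0')
    y_out = graph.get_tensor_by_name('Add:0')  # assumed name of the model's final tf.add
    print(sess.run(y_out, feed_dict={X: X_val[0:9], is_training: False}))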