I can't save/restore my trained TensorFlow model. The visible symptom is that the restored model performs poorly because it is using random weights rather than the trained ones. Others have hit this problem, but my root cause appears to be different: possibly duplicated variable names.
I have confirmed that the weights in the checkpoint file match the ones used by the trained model, and that the weights after restoring are completely different. I also see duplicated variables ending in _1 before saving. Even more unusually, after restoring there appear to be duplicated variables with identical names.
I have been careful to run tf.reset_default_graph() both before setting up the graph the first time and again before restoring it. One potential problem may be that I run global_variables_initializer() before restoring the variables; if I don't do that, I get uninitialized-variable errors when running the restored model.
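For reference, my understanding of the documented pattern is that restore() replaces initialization entirely, roughly like this (a minimal sketch; build_graph() and checkpoint_prefix are hypothetical stand-ins for my setup below):

import tensorflow as tf

tf.reset_default_graph()
y_out = build_graph()  # hypothetical: rebuilds exactly the graph that was saved
saver = tf.train.Saver()
with tf.Session() as sess:
    # restore() assigns every saved variable itself, so no
    # global_variables_initializer() call should be needed first
    saver.restore(sess, checkpoint_prefix)  # hypothetical checkpoint prefix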
I am using the latest version of TensorFlow with GPU support. Here is the code, split into build, train/save, and restore:
#build model
def model_ten_layer_unet(X, y, is_training):
    # define our weights (e.g. init_two_layer_convnet)
    M = 4
    # setup variables
    Wconv1 = tf.get_variable("Wconv1", shape=[3, 3, 3, 1, M*32])  # SAME: never change output size
    bconv1 = tf.get_variable("bconv1", shape=[M*32])
    Wconv2 = tf.get_variable("Wconv2", shape=[3, 3, 3, M*32, M*32])
    bconv2 = tf.get_variable("bconv2", shape=[M*32])
    ...
    #Wconv12 = tf.get_variable("Wconv12", shape=[3, 3, 3, 1, M*32])  # SAME: final layer combines stacked outputs
    #bconv12 = tf.get_variable("bconv12", shape=[1])
    # define our graph (e.g. two_layer_convnet)
    d1 = tf.nn.max_pool3d(tf.expand_dims(X, -1), ksize=[1,2,2,2,1], strides=[1,2,2,2,1], padding='VALID')
    print('d1 shape: ', d1.shape)
    ...
    a12 = tf.layers.conv3d_transpose(c10, filters=1, kernel_size=[3,3,3], strides=[2,2,2], padding='SAME')
    print('a12 shape: ', a12.shape)
    y_out = tf.add(tf.squeeze(a12, -1), X)
    print('y_out shape: ', y_out.shape)
    return y_out
def run_model(session, predict, loss_val, Xd, yd, mean_image,
              epochs=1, batch_size=64, print_every=100,
              training=None, plot_losses=False):
    # have tensorflow compute accuracy (RMSE normalized by the mean image)
    mse = tf.reduce_mean(tf.square(tf.subtract(predict, y)))
    accuracy = tf.sqrt(mse) / tf.reduce_mean(tf.cast(mean_image, tf.float32))
    # shuffle indices
    train_indicies = np.arange(Xd.shape[0])
    np.random.shuffle(train_indicies)
    training_now = training is not None
    # set up the ops we want to compute (and optimize);
    # mse is just a placeholder entry here: if we have a
    # training op, it replaces mse in the list below
    variables = [mean_loss, accuracy, mse]
    if training_now:
        variables[-1] = training
    # counter
    iter_cnt = 0
    for e in range(epochs):
        # keep track of losses and accuracy
        losses = []
        accuracies = []
        # make sure we iterate over the dataset once
        for i in range(int(math.ceil(Xd.shape[0] / batch_size))):
            # generate indices for the batch
            start_idx = (i * batch_size) % Xd.shape[0]
            idx = train_indicies[start_idx:start_idx + batch_size]
            # create a feed dictionary for this batch
            feed_dict = {X: Xd[idx, :],
                         y: yd[idx],
                         is_training: training_now}
            # get batch size
            actual_batch_size = yd[idx].shape[0]
            # have tensorflow compute loss and accuracy
            # and (if given) perform a training step
            loss, accuracy, _ = session.run(variables, feed_dict=feed_dict)
            # aggregate performance stats
            losses.append(loss)
            accuracies.append(accuracy)
            # print every now and then
            if training_now and (iter_cnt % print_every) == 0:
                print("Iteration {0}: with minibatch training loss = {1:.3g} and accuracy of {2:.2g}"
                      .format(iter_cnt, loss, accuracy))
            iter_cnt += 1
        total_accuracy = np.mean(accuracies)
        total_loss = np.mean(losses)
        ...
    return total_loss, total_accuracy, y_out
tf.reset_default_graph()
X = tf.placeholder(tf.float32, [None, 64, 64, 64])
y = tf.placeholder(tf.float32, [None, 64, 64, 64])
is_training = tf.placeholder(tf.bool)
y_out = model_ten_layer_unet(X,y,is_training)
total_loss = tf.square(tf.subtract(y_out, y))
mean_loss = tf.reduce_mean(total_loss)
global_step = tf.Variable(0, trainable=False)
boundaries = [5000,10000,15000]
values = [4e-4,2e-4,5e-5,1e-5]
learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
optimizer = tf.train.AdamOptimizer(learning_rate) # select optimizer and set learning rate
# enable saving and loading models
saver = tf.train.Saver()
#set up path for saving models
home = os.getenv("HOME")
work_dir = 'assignment2'
model_dir = 'models'
model_name = 'unet1'
model_path = os.path.join(home,work_dir,model_dir)
if not os.path.exists(model_path):
    os.makedirs(model_path)
model_pathname = os.path.join(model_path,model_name)
# batch normalization in tensorflow requires this extra dependency
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(extra_update_ops):
    train_step = optimizer.minimize(mean_loss, global_step=global_step)
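As an aside, I would have expected duplicate names to be impossible within a single graph, because tf.get_variable raises an error when a name already exists. A minimal sketch of that behavior (the shape is just illustrative):

import tensorflow as tf

tf.reset_default_graph()
w = tf.get_variable("Wconv1", shape=[3, 3, 3, 1, 128])
try:
    w = tf.get_variable("Wconv1", shape=[3, 3, 3, 1, 128])
except ValueError as err:
    print(err)  # "Variable Wconv1 already exists, disallowed. ..." in TF 1.x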
#train model
with tf.Session() as sess:
    #with tf.device("/gpu:0"): #"/cpu:0" or "/gpu:0"
    sess.run(tf.global_variables_initializer())
    #print(sess.run('Wconv1:0'))
    for e in range(1):
        print('Training epoch ', e+1)
        run_model(sess, y_out, mean_loss, X_train, y_train, mean_image, 1, 32, 1, train_step, False)
        print('Validation epoch ', e+1)
        _, _, y_hat = run_model(sess, y_out, mean_loss, X_val, y_val, mean_image, 1, 32, 1)
    print('Global Variables')
    for v in tf.global_variables():
        print(v.name)
    print('Local Variables')
    for v in tf.local_variables():
        print(v.name)
    saver.export_meta_graph(model_pathname, clear_devices=True)
    save_path = saver.save(sess, model_pathname, global_step=global_step)
    print("Model saved in path: %s" % save_path)
#Output
Training epoch 1
Iteration 0: with minibatch training loss = 7.95e+03 and accuracy of 0.2
...
Iteration 27: with minibatch training loss = 7.51e+03 and accuracy of 0.2
Global Variables
Wconv1:0
bconv1:0
Wconv2:0
bconv2:0
Wconv3:0
bconv3:0
conv3d_transpose/kernel:0
conv3d_transpose/bias:0
conv3d_transpose_1/kernel:0
conv3d_transpose_1/bias:0
Variable:0
beta1_power:0
beta2_power:0
Wconv1/Adam:0
Wconv1/Adam_1:0 **Duplicated variables with _1 before saving**
bconv1/Adam:0
bconv1/Adam_1:0
Wconv2/Adam:0
Wconv2/Adam_1:0
bconv2/Adam:0
bconv2/Adam_1:0
Wconv3/Adam:0
Wconv3/Adam_1:0
bconv3/Adam:0
bconv3/Adam_1:0
conv3d_transpose/kernel/Adam:0
conv3d_transpose/kernel/Adam_1:0
...
conv3d_transpose_1/bias/Adam:0
conv3d_transpose_1/bias/Adam_1:0
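For completeness, one way to double-check what actually landed on disk is to list the checkpoint contents directly (a minimal sketch; save_path is the prefix returned by saver.save() above):

import tensorflow as tf

# print every variable name and shape stored in the checkpoint just written
for name, shape in tf.train.list_variables(save_path):
    print(name, shape)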
#restore model
tf.reset_default_graph()
# Later, launch the model, use the saver to restore variables from disk, and
# do some work with the model.
with tf.Session() as sess:
    X = tf.placeholder(tf.float32, [None, 64, 64, 64])
    y = tf.placeholder(tf.float32, [None, 64, 64, 64])
    is_training = tf.placeholder(tf.bool)
    y_out = model_ten_layer_unet(X, y, is_training)
    total_loss = tf.square(tf.subtract(y_out, y))
    mean_loss = tf.reduce_mean(total_loss)
    sess.run(tf.global_variables_initializer())
    new_saver = tf.train.import_meta_graph(save_path + '.meta')
    # Restore variables from disk.
    new_saver.restore(sess, save_path)
    print("Model restored.")
    # Load output images
    print('Global Variables')
    for v in tf.global_variables():
        print(v.name)
    print('Local Variables')
    for v in tf.local_variables():
        print(v.name)
    _, _, y_hat = run_model(sess, y_out, mean_loss, X_val, y_val, mean_image, 1, 32, 1)
    y_hat = y_hat.eval(feed_dict={X: X_val[0:9,], is_training: False})
#Output
d1 shape: (?, 32, 32, 32, 1)
...
y_out shape: (?, 64, 64, 64)
Global Variables
Wconv1:0
bconv1:0
Wconv2:0
bconv2:0
Wconv3:0
bconv3:0
conv3d_transpose/kernel:0
conv3d_transpose/bias:0
conv3d_transpose_1/kernel:0
conv3d_transpose_1/bias:0
Wconv1:0 **Repeated variables with the same name**
bconv1:0
Wconv2:0
bconv2:0
Wconv3:0
bconv3:0
conv3d_transpose/kernel:0
conv3d_transpose/bias:0
conv3d_transpose_1/kernel:0
conv3d_transpose_1/bias:0
Variable:0
beta1_power:0
beta2_power:0
Wconv1/Adam:0
Wconv1/Adam_1:0
bconv1/Adam:0
bconv1/Adam_1:0
Wconv2/Adam:0
Wconv2/Adam_1:0
bconv2/Adam:0
bconv2/Adam_1:0
Wconv3/Adam:0
Wconv3/Adam_1:0
bconv3/Adam:0
bconv3/Adam_1:0
conv3d_transpose/kernel/Adam:0
conv3d_transpose/kernel/Adam_1:0
...
conv3d_transpose_1/bias/Adam:0
conv3d_transpose_1/bias/Adam_1:0
Local Variables
Epoch 1, Overall loss = 9.01e+03 and accuracy of 0.214 **Poor validation performance**
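To quantify "completely different", a live tensor can be compared against its checkpoint copy inside the restore session above (a sketch; save_path is the prefix from the training run, and Wconv1 is one of my variables):

import numpy as np
import tensorflow as tf

reader = tf.train.load_checkpoint(save_path)  # checkpoint reader over the saved prefix
ckpt_w = reader.get_tensor('Wconv1')          # checkpoint keys omit the ':0' suffix
live_w = sess.run('Wconv1:0')                 # whichever Wconv1 the session resolves to
print('max abs diff:', np.max(np.abs(ckpt_w - live_w)))  # a large value means restore landed elsewhere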
Answer 0 (score: 1)
You have posted a lot of code, and it would take a long time to work through all of it. Here is a simple example of how to restore the latest checkpoint:
import os
import tensorflow as tf

def predict(img_name):  # hypothetical wrapper, implied by the trailing return and img_name
    with tf.Graph().as_default():
        x = tf.placeholder(tf.float32, shape=[None, 784])
        output = inference(x)  # custom model-building function
        saver = tf.train.Saver()
        sess = tf.Session()
        base_dir = os.path.dirname(os.path.abspath(__file__))
        checkpoint_path = os.path.join(os.path.dirname(base_dir), "logistic_regression/logistic_logs/")
        # latest_checkpoint finds the newest checkpoint prefix in the directory;
        # restore() then assigns every saved variable, so no initializer is needed
        saver.restore(sess, tf.train.latest_checkpoint(checkpoint_path))
        result = sess.run(tf.argmax(output, 1), feed_dict={x: conv_mnist(img_name)})  # conv_mnist: custom preprocessing
        print("Result:", result)
        return result
I hope this code example helps you.
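If you do want to keep the import_meta_graph approach from the question instead, the key point is to not rebuild the model on top of the imported graph: import, restore, and look tensors up by name. A sketch, where the tensor names are assumptions to verify against the actual graph (X_val as in the question):

import tensorflow as tf

tf.reset_default_graph()
with tf.Session() as sess:
    # the meta graph already contains the model, so do not call
    # model_ten_layer_unet() or global_variables_initializer() here
    new_saver = tf.train.import_meta_graph(save_path + '.meta')
    new_saver.restore(sess, save_path)
    graph = tf.get_default_graph()
    # unnamed placeholders get default names such as 'Placeholder:0';
    # verify with [op.name for op in graph.get_operations()]
    X = graph.get_tensor_by_name('Placeholder:0')
    is_training = graph.get_tensor_by_name('Placeholder_2:0')
    y_out = graph.get_tensor_by_name('Add:0')  # assumed name of the model's final tf.add
    print(sess.run(y_out, feed_dict={X: X_val[0:9], is_training: False}))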