Error running a TensorFlow model when using the tf.data API and a custom optimizer

Time: 2018-10-08 16:57:14

Tags: python tensorflow

In the following code I want to train a model in TensorFlow. The model is a ResNet, which is deep, so the batch size has to be small for the data and all the activations to fit in memory. For that reason I implemented a custom optimizer that accumulates gradients over several fed micro-batches and only applies the gradient-descent update at the end. In addition, I use the tf.data API to read the data from tfrecords that I created. Note that my inputs are video frames; the detected variable indicates whether a face was detected in a particular frame, so detected is only used inside the MSE (mentioned just for clarification).
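
Roughly, this is how I intend to drive the accumulation ops inside the session (a simplified sketch; num_accum_steps is just a placeholder for the number of micro-batches accumulated before each weight update, and zero_ops, accum_ops and train_step are the ops defined in optimize() further down):

# Simplified sketch of the intended accumulation loop (num_accum_steps is a placeholder):
sess.run(zero_ops)                    # reset the gradient accumulators
for _ in range(num_accum_steps):
    sess.run(accum_ops)               # accumulate the gradients of one micro-batch
sess.run(train_step)                  # apply the accumulated gradients in one update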

import tensorflow as tf
import numpy as np
import csv
import os

num_epoch = 100
latent_dim = 100
cell_size = 100

# for each input frame, I have 3 outputs.
num_classes = 3

common = "C:/Users/user/Documents/SEWA_db/tfrecords_db/"

filenames_train = []
filenames_dev = []

for i in range(1, 35):
    filenames_train.append(common + "Train_DE_{num:02d}.tfrecords".format(num=i))

for i in range(1, 15):
    filenames_dev.append(common + "Devel_DE_{num:02d}.tfrecords".format(num=i))

phase_train = tf.placeholder_with_default(True, shape=(), name='phase')

train_batch_size = 5
test_batch_size = 5

tf.set_random_seed(123)
mseed = 123

# this method is used within the model()...
def create_variables(name, shape, initializer=tf.contrib.layers.xavier_initializer(), weight_decay=0.0001):
    '''
    :param name: A string. The name of the new variable.
    :param shape: A list of dimensions.
    :param initializer: Initializer for the variable; Xavier is used by default.
    :param weight_decay: Scale of the L2 regularization applied to the variable.
    :return: The created variable.
    '''

    ## TODO: to allow different weight decay to fully connected layer and conv layer
    regularizer = tf.contrib.layers.l2_regularizer(scale=weight_decay)

    new_variables = tf.get_variable(name, shape=shape, initializer=initializer,
                                    regularizer=regularizer)
    return new_variables

def model(inputs, n):
    ...  # ResNet body omitted
    # predictions shape: (batch_size, 3)
    return predictions

# loss function:
summaries_while_testing = []
summaries_while_training = []

def loss(predictions, labels, detected, name_scope, train_test):
    # MSE
    with tf.name_scope(name_scope):

        MSE = tf.square(tf.subtract(predictions, labels))
        MSE = tf.boolean_mask(MSE, detected)
        MSE = tf.reduce_mean(MSE)

        if train_test == 'Train':
            reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            MSE += tf.reduce_sum(reg_losses)
            loss_s = tf.summary.scalar('MSE', MSE)
            summaries_while_training.append(loss_s)
        else:
            loss_s = tf.summary.scalar('MSE', MSE)
            summaries_while_testing.append(loss_s)

    return MSE

# optimizer:
def optimize(mse):
    with tf.name_scope('Optimizer'):
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        with tf.control_dependencies(update_ops):
            optimizer = tf.train.AdamOptimizer(learning_rate=0.001)

            trainable_variables = tf.trainable_variables()

            accum_vars = [tf.Variable(tf.zeros_like(single_tr_variable.value()), trainable=False)
                          for single_tr_variable in trainable_variables]

            # This is used to reset the accumulators between different training iterations...
            zero_ops = [tv.assign(tf.zeros_like(tv)) for tv in accum_vars]

            grads_vars = optimizer.compute_gradients(mse, trainable_variables)

            accum_ops = [accum_vars[i].assign_add(gv[0]) for i, gv in enumerate(grads_vars) if gv[0] is not None]
            train_step = optimizer.apply_gradients([(accum_vars[i], gv[1]) for i, gv in enumerate(grads_vars)])

            return train_step, accum_ops, zero_ops

# retrieve data section
def _parse_function(example_proto):

    # The annotation contains the following features: timestamp; arousal; valence; liking
    features = {
        'height': tf.FixedLenFeature([], tf.int64),
        'width': tf.FixedLenFeature([], tf.int64),
        'image_raw': tf.FixedLenFeature([], tf.string),
        'frame_number': tf.FixedLenFeature([1], tf.int64),
        'detected': tf.FixedLenFeature([1], tf.int64),
        'arousal': tf.FixedLenFeature([1], tf.float32),
        'valence': tf.FixedLenFeature([1], tf.float32),
        'liking': tf.FixedLenFeature([1], tf.float32)
    }

    parsed_features = tf.parse_single_example(example_proto, features)

    # This is how we create one example, that is, extract one example from the database.
    image = tf.decode_raw(parsed_features['image_raw'], tf.uint8)
    # The height and the width are used to restore the original image shape.
    height = tf.cast(parsed_features['height'], tf.int32)
    width = tf.cast(parsed_features['width'], tf.int32)

    # The image is reshaped because it is flattened when stored in binary format; the height
    # and the width are needed to restore the original image.
    # Tensor("Reshape:0", shape=(112, 112, 3), dtype=uint8)
    image = tf.reshape(image, [112, 112, 3])

    detected = parsed_features['detected']
    arousal = parsed_features['arousal']
    valence = parsed_features['valence']
    liking = parsed_features['liking']

    return detected, arousal, valence, liking, image

###############################      TRAINING      ###################################

datasets_train_iterators = []

for file_name in filenames_train:
    dataset_train = tf.data.TFRecordDataset(file_name).map(_parse_function).batch(train_batch_size)
    datasets_train_iterators.append(dataset_train)

dataset_train_all = tf.data.Dataset.zip(tuple(datasets_train_iterators))
iterator_train_all = dataset_train_all.make_initializable_iterator()

def retrieve_inputs_train():

    next_batch = iterator_train_all.get_next()

    detected = []
    arousal = []
    valence = []
    liking = []
    images = []

    for n in next_batch:
        detected.append(n[0])
        arousal.append(n[1])
        valence.append(n[2])
        liking.append(n[3])
        images.append(n[4])

    detected = tf.concat(detected, axis=0)
    arousal = tf.concat(arousal, axis=0)
    valence = tf.concat(valence, axis=0)
    liking = tf.concat(liking, axis=0)
    images = tf.concat(images, axis=0)

    return detected, arousal, valence, liking, images

###############################      TESTING      ###################################
datasets_dev_iterators = []

for file_name in filenames_dev:
    dataset_dev = tf.data.TFRecordDataset(file_name).map(_parse_function).batch(test_batch_size)
    datasets_dev_iterators.append(dataset_dev)

dataset_dev_all = tf.data.Dataset.zip(tuple(datasets_dev_iterators))
iterator_dev_all = dataset_dev_all.make_initializable_iterator()

def retrieve_inputs_dev():

    next_batch = iterator_dev_all.get_next()

    detected = []
    arousal = []
    valence = []
    liking = []
    images = []

    for n in next_batch:
        detected.append(n[0])
        arousal.append(n[1])
        valence.append(n[2])
        liking.append(n[3])
        images.append(n[4])

    detected = tf.concat(detected, axis=0)
    arousal = tf.concat(arousal, axis=0)
    valence = tf.concat(valence, axis=0)
    liking = tf.concat(liking, axis=0)
    images = tf.concat(images, axis=0)

    return detected, arousal, valence, liking, images

# preparing model before training
detected, arousal, valence, liking, images = tf.cond(phase_train,
                                                     lambda: retrieve_inputs_train(),
                                                     lambda: retrieve_inputs_dev())

images_casted = tf.cast(images, tf.float32)
with tf.name_scope('image_normal'):
    images_casted_normalized = tf.map_fn(lambda img: tf.image.per_image_standardization(img), images_casted)

# shape of predictions: (680, 3) -> 3 because we are outputting arousal, valence and liking
# the n parameter is the ResNet configuration... not important for now
predictions = model(images_casted_normalized, n=[3, 4, 6, 3])

predicted_arousal = tf.slice(predictions, begin=[0, 0], size=[-1, 1], name='predicted_arousal')
predicted_valence = tf.slice(predictions, begin=[0, 1], size=[-1, 1], name='predicted_valence')
predicted_liking = tf.slice(predictions, begin=[0, 2], size=[-1, 1], name='predicted_liking')

MSE_a = tf.cond(phase_train, 
                lambda: loss(predicted_arousal, arousal, detected, 'MSE_arousal_Train', 'Train'),
                lambda: loss(predicted_arousal, arousal, detected, 'MSE_arousal_Devel', 'Devel'))

MSE_v = tf.cond(phase_train, 
                lambda: loss(predicted_valence, valence, detected, 'MSE_valence_Train', 'Train'),
                lambda: loss(predicted_valence, valence, detected, 'MSE_valence_Devel', 'Devel'))

MSE_l = tf.cond(phase_train,
                lambda: loss(predicted_liking, liking, detected, 'MSE_liking_Train', 'Train'),
                lambda: loss(predicted_liking, liking, detected, 'MSE_liking_Devel', 'Devel'))

MSE = MSE_a + MSE_v + MSE_l

train_step, accum_ops, zero_ops = optimize(MSE)

init_op = tf.global_variables_initializer()

model_path = "C:/Users/user/Documents/f24/model"
events_path = "C:/Users/user/Documents/f24/event_files/34_layers"
with tf.Session() as sess:

    sess.run(init_op)

    train_writer = tf.summary.FileWriter(events_path, sess.graph)

    merged_train = tf.summary.merge(summaries_while_training)
    merged_val = tf.summary.merge(summaries_while_testing)

    sess.run(iterator_train_all.initializer)
    sess.run(iterator_dev_all.initializer)

Finally, the following error appears:

FailedPreconditionError: Attempting to use uninitialized value conv3_1/conv2_in_block/conv
     [[Node: conv3_1/conv2_in_block/conv/read = Identity[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](conv3_1/conv2_in_block/conv)]]

During handling of the above exception, another exception occurred:

FailedPreconditionError                   Traceback (most recent call last)
<ipython-input-11-dbe6d12c67ce> in <module>()
      7 
      8     for v in accum_vars:
----> 9         sess.run(v.initializer)
     10 
     11     sess.run(init_op)

...

File "<ipython-input-10-8d7d7b4aa814>", line 10, in <module>
    predictions = model(images_casted_normalized, n=[3, 4, 6, 3])
  File "<ipython-input-5-fae307f9536f>", line 25, in model
    conv3 = residual_block(layers[-1], 256, is_training=phase_train)
  File "<ipython-input-4-d8a2d1403f18>", line 97, in residual_block
    conv2 = bn_relu_conv_layer(conv1, [3, 3, output_channel, output_channel], 1, is_training=is_training)
  File "<ipython-input-4-d8a2d1403f18>", line 61, in bn_relu_conv_layer
    filter = create_variables(name='conv', shape=filter_shape)
  File "<ipython-input-4-d8a2d1403f18>", line 15, in create_variables
    regularizer=regularizer)
  File "C:\Users\user\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 1317, in get_variable
    constraint=constraint)

Now, when I remove the following two lines from optimize(), my code works fine, but I know this is wrong:

update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

with tf.control_dependencies(update_ops):

Alternatively, if I use the following code for the optimizer, my code runs fine:

def optimize(mse):
    with tf.name_scope('Optimizer'):
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        with tf.control_dependencies(update_ops):
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            train_step = optimizer.minimize(mse)
            return train_step
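
For reference, my understanding is that minimize() is essentially shorthand for compute_gradients() followed by apply_gradients(), i.e. roughly this (a sketch, not the exact TensorFlow implementation):

# Roughly what optimizer.minimize(mse) does internally (simplified sketch):
grads_vars = optimizer.compute_gradients(mse)
train_step = optimizer.apply_gradients(grads_vars)

So the main difference in my version is that it additionally creates the accum_vars accumulators and splits the update into accum_ops followed by a final apply_gradients step.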

This seems very odd to me, and I would really like to understand the reason for this error.

Thanks a lot for your help!

0 Answers:

There are no answers.