Multi-GPU towers; ValueError: None values not supported

Asked: 2016-06-02 13:27:57

Tags: tensorflow

I am trying to use the multi-GPU tower functions from cifar10_train_multiGPU.py on MNIST data in a Jupyter notebook environment.

- However, I do not apply the moving average decay to the losses and variables.

However, running these functions raises an error, and I don't know why. Please help me understand it, and if there is one, please give me a better template for making full use of the GPUs.

Here is the part that causes the error:

with tf.device('/cpu:0'):
    # Setting optimizer
    optimizer = OPTIMIZER('GD')

    # Calculate the gradients for each model tower.
    total_grads = []
    for i in range(n_GPU):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                tower_loss = TOWER_LOSS(scope)
                tf.get_variable_scope().reuse_variables()
                tower_grads = optimizer.compute_gradients(tower_loss)
                print(tower_grads)
                total_grads.append(tower_grads)
    grads = AVERAGE_GRADIENTS(total_grads)

    train_op = optimizer.apply_gradients(grads, global_step=global_step)

When I print(tower_grads), the result for the first device is fine. After that, however, None values appear:

[(, ), (, ), (, ), (, ), (, ), (, ), (, ), (, )]
[(None, ), (None, ), (None, ), (None, ), (None, ), (None, ), (None, ), (None, ), (, ), (, ), (, ), (, ), (, ), (, ), (, ), (, )]

So, I changed the code to compute the gradients only from the variables in each tower's own scope:

tower_grads = optimizer.compute_gradients(tower_loss, tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope))

Then the code runs, but the loss is not minimized:

Iter: 100   Total_loss: 2.62306
Iter: 200   Total_loss: 2.67096
Iter: 300   Total_loss: 2.82596
Iter: 400   Total_loss: 2.76608
Iter: 500   Total_loss: 2.71014
Iter: 600   Total_loss: 2.79502
Iter: 700   Total_loss: 2.8309
Iter: 800   Total_loss: 2.89564
Iter: 900   Total_loss: 2.75665
Iter: 1000  Total_loss: 2.75197
Iter: 1100  Total_loss: 2.90878
Iter: 1200  Total_loss: 2.6832
Iter: 1300  Total_loss: 2.80052

Here is the full code:

x = tf.placeholder(tf.float32, shape=[None, IMAGE_SIZE*IMAGE_SIZE*IMAGE_CHANNELS], name='x_raw')
y_ = tf.placeholder(tf.float32, shape=[None, LABEL_SIZE], name='label')

# Global_step
global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)

def INFERENCE(_X):
    _x_reshaped = tf.reshape(_X, [-1, IMAGE_SIZE, IMAGE_SIZE, IMAGE_CHANNELS], name='x_reshaped')
    C1 = _CONV(_x_reshaped, 5, 64)
    S1 = _MAXPOOL(C1)
    C2 = _CONV(S1, 5, 128)
    S2 = _MAXPOOL(C2)

    _size = S2.get_shape()[1].value
    _channel_size = S2.get_shape()[3].value
    x_flatten = tf.reshape(S2, [-1, _size*_size*_channel_size])  
    FC3 = tf.nn.relu(_LINEAR(x_flatten, n_out=1024))
    _softmax_linear = _LINEAR(FC3, LABEL_SIZE)
    return _softmax_linear

def _WEIGHT_WITH_WD(_SHAPE):
    _initial = tf.truncated_normal(_SHAPE, stddev=0.05)
    _weight = tf.Variable(_initial, name='weight', trainable=True)
    return _weight

def _BIAS(_SHAPE):
    _initial = tf.constant(0.1, shape=_SHAPE)  # slightly positive init to prevent dead ReLU cells
    return tf.Variable(_initial, name='bias', trainable=True)

def _CONV(_X, kernel_size, out_channel_size, stride_size=1, wd=WD_CONV):
    _in_channels = _X.get_shape()[3].value  
    _kernel_shape = [kernel_size, kernel_size, _in_channels, out_channel_size]
    _kernel = _WEIGHT_WITH_WD(_kernel_shape)
    _stride = [1, stride_size, stride_size, 1]
    _conv = tf.nn.conv2d(_X, _kernel, _stride, padding='SAME')
    _conv_activated = tf.nn.relu(_conv + _BIAS([out_channel_size]))
    return _conv_activated

def _MAXPOOL(_X, kernel_size=2, stride_size=2):
    _kernel_shape = [1, kernel_size, kernel_size, 1]
    _stride = [1, stride_size, stride_size, 1]
    _max_pool = tf.nn.max_pool(_X, _kernel_shape, _stride, padding='SAME')
    return _max_pool

def _LINEAR(_X, n_out):
    _weight = _WEIGHT_WITH_WD([_X.get_shape()[1].value, n_out])
    _bias = _BIAS([n_out])
    _linear = tf.matmul(_X, _weight) + _bias
    return _linear

def OPTIMIZER(optimizer='GD'):
    _lr = LR
    _optimizer_dic = {'GD':tf.train.GradientDescentOptimizer(_lr), 'ADAM':tf.train.AdamOptimizer(_lr)}
    _opt = _optimizer_dic[optimizer]
    return _opt

def TOTAL_LOSS(logits, labels):
    with tf.name_scope('Loss') as scope:
        _cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, labels, name='cross_entropy_per_example')
        _cross_entropy_mean = tf.reduce_mean(_cross_entropy, name='cross_entropy_per_batch')
        tf.add_to_collection('losses', _cross_entropy_mean)
        _total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
        return _total_loss

def TOWER_LOSS(scope):    
    _logits = INFERENCE(x)
    _ = TOTAL_LOSS(_logits, y_)
    _tower_loss_list = tf.get_collection('losses', scope)
    _tower_loss = tf.add_n(_tower_loss_list, name='tower_loss')

    # Compute the moving average of all individual losses and the total loss.
    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
    loss_averages_op = loss_averages.apply(_tower_loss_list + [_tower_loss])

    with tf.control_dependencies([loss_averages_op]):
        _tower_loss = tf.identity(_tower_loss)

    return _tower_loss

def AVERAGE_GRADIENTS(total_grads):
    average_grads = []
    # zip(*total_grads) groups the i-th (gradient, variable) pair of every tower.
    for grad_n_vars in zip(*total_grads):
        grads = []
        for g, _ in grad_n_vars:
            # This line raises "ValueError: None values not supported"
            # when compute_gradients() returned a None gradient.
            expanded_g = tf.expand_dims(g, 0)
            grads.append(expanded_g)
        grad = tf.concat(0, grads)
        # Note: reduce_sum sums the tower gradients; reduce_mean would average them.
        grad = tf.reduce_sum(grad, reduction_indices=0)
        # Pair the combined gradient with the first tower's variable,
        # which assumes the towers actually share their variables.
        var = grad_n_vars[0][1]
        grad_n_var = (grad, var)
        average_grads.append(grad_n_var)
    return average_grads

with tf.device('/cpu:0'):
    # Setting optimizer
    optimizer = OPTIMIZER('GD')

    # Calculate the gradients for each model tower.
    total_grads = []
    for i in range(n_GPU):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                tower_loss = TOWER_LOSS(scope)
                tf.get_variable_scope().reuse_variables()
                tower_grads = optimizer.compute_gradients(tower_loss, tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope))
                total_grads.append(tower_grads)
    grads = AVERAGE_GRADIENTS(total_grads)

    train_op = optimizer.apply_gradients(grads, global_step=global_step)

    ## Launch the graph
    with tf.Session(config=CONFIG) as sess:
        sess.run(tf.initialize_all_variables())

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        # training
        for i in range(MAX_ITER):
            if i%PRINT_BY != 0 or i==0:
                tr_x, tr_y_ = mnist.train.next_batch(BATCH_SIZE)
                sess.run(train_op, feed_dict={x:tr_x, y_:tr_y_})

            else:
                ## print
                gstep, tower_loss_ = sess.run([global_step, tower_loss], feed_dict={x:tr_x, y_:tr_y_})
                print("GStep: %d \tIter :%d \tTotal_loss: %g" \
                      %(gstep, i, tower_loss_))

1 Answer:

Answer 0 (score: 2):

From your output, it looks like each tower is creating a new set of variables:

[(, ), (, ), (, ), (, ), (, ), (, ), (, ), (, )]
[(None, ), (None, ), (None, ), (None, ), (None, ), (None, ), (None, ), (None, ), (, ), (, ), (, ), (, ), (, ), (, ), (, ), (, )]

Note that there are twice as many variables in the output for the second tower (16 vs. 8), and that the first 8 variables have no gradient in the second tower. This also explains why the loss is not minimized with your scope-filtered workaround: each tower builds its own private copy of the weights, AVERAGE_GRADIENTS() applies the combined gradients only to the first tower's copy (grad_n_vars[0][1]), and the tower_loss you print comes from the last tower's copy, which is never updated.
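
The None entries themselves come straight from compute_gradients(): it returns a (None, var) pair for every trainable variable that the given loss does not depend on. A minimal sketch of the mechanism (the variable names are made up, not from your code):

import tensorflow as tf

v_used = tf.Variable(1.0, name='used')
v_unused = tf.Variable(2.0, name='unused')  # plays no part in the loss
loss = 3.0 * v_used

opt = tf.train.GradientDescentOptimizer(0.1)
print(opt.compute_gradients(loss))
# roughly: [(<gradient tensor>, v_used), (None, v_unused)]

In your first version, the second tower's loss only depends on the second tower's freshly created variables, so the first tower's variables all get None gradients, and tf.expand_dims(None, 0) inside AVERAGE_GRADIENTS() is what raises the "ValueError: None values not supported".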

There are two possible fixes:

  1. Make sure that your TOWER_LOSS() function contains with tf.variable_scope(...): blocks. See the cifar10.inference() function for an example; you need to create tf.variable_scope()s in order to use variable sharing (see the sketch after this list).

  2. Use tf.get_variable() instead of tf.Variable() to ensure that variables are shared where possible. Here is an example of doing this in the CIFAR-10 model.

For more details on sharing variables, see the how-to guide.
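
Applied to your code, a rough, untested sketch of both fixes together might look like this (the per-layer scope names 'conv1', 'conv2', 'fc3' and 'softmax_linear' are only illustrative, following the cifar10.inference() pattern):

def _WEIGHT_WITH_WD(_SHAPE):
    # Fix 2: tf.get_variable() instead of tf.Variable(). In a reusing
    # variable scope this returns the already-created variable instead
    # of silently making a fresh copy for every tower.
    return tf.get_variable('weight', _SHAPE,
                           initializer=tf.truncated_normal_initializer(stddev=0.05))

def _BIAS(_SHAPE):
    return tf.get_variable('bias', _SHAPE,
                           initializer=tf.constant_initializer(0.1))

def INFERENCE(_X):
    # Fix 1: give every layer its own tf.variable_scope(), so that
    # tf.get_variable() creates e.g. 'conv1/weight' on the first tower
    # and looks the same variable up on the remaining towers.
    _x_reshaped = tf.reshape(_X, [-1, IMAGE_SIZE, IMAGE_SIZE, IMAGE_CHANNELS])
    with tf.variable_scope('conv1'):
        C1 = _CONV(_x_reshaped, 5, 64)
    S1 = _MAXPOOL(C1)
    with tf.variable_scope('conv2'):
        C2 = _CONV(S1, 5, 128)
    S2 = _MAXPOOL(C2)
    _size = S2.get_shape()[1].value
    _channel_size = S2.get_shape()[3].value
    x_flatten = tf.reshape(S2, [-1, _size*_size*_channel_size])
    with tf.variable_scope('fc3'):
        FC3 = tf.nn.relu(_LINEAR(x_flatten, n_out=1024))
    with tf.variable_scope('softmax_linear'):
        _softmax_linear = _LINEAR(FC3, LABEL_SIZE)
    return _softmax_linear

With these changes, the tf.get_variable_scope().reuse_variables() call in your tower loop really does share the weights between the towers, every tower's gradients are defined for the shared variables, and the scope argument to compute_gradients() is no longer needed.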