ResourceExhaustedError - Out of memory in TensorFlow

Time: 2017-09-16 20:35:38

Tags: tensorflow neural-network nvidia conv-neural-network tensorflow-gpu

Below is a code snippet for my AlexNet-style network (the input here is 224 x 224 grayscale). I am getting the resource-exhausted error posted below and I am not sure why:

import tensorflow as tf
from IPython import embed
import pre_process_data as dataProc
import numpy as np

#Converting RGB to Gray
from skimage import color

#Parameters
learning_rate = 0.001
training_iters = 200000
batch_size = 4 
display_step = 20

#Network Parameters
n_input = [224,224,3] # raw image dimensions (converted to 224 x 224 grayscale below)
n_classes = 7 # classes 0-6
dropout = 0.8 # dropout keep probability

def ReturnOneHotVector(labels):
    labels = labels.astype(np.int32)
    sparse_labels = tf.reshape(labels, [-1, 1]) 
    derived_size = tf.shape(sparse_labels)[0]
    indices = tf.reshape(tf.range(0, derived_size, 1), [-1, 1]) 
    concated = tf.concat([indices, sparse_labels], axis=1)
    outshape = tf.concat([tf.reshape(derived_size, [1]), tf.reshape(n_classes, [1])], axis=0)
    one_hot_labels = tf.sparse_to_dense(concated, outshape, 1.0, 0.0)
    return one_hot_labels

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial) # wrapped in tf.Variable so the biases are trainable

def conv2d(x,W):
    return tf.nn.conv2d(x,W,strides=[1,1,1,1],padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')

#Build Graph
def deepnn(x_image):
    '''x_image: input of shape [None, 224*224] (flattened grayscale images).
    Returns y_conv (logits of shape [None, 7]) and the scalar keep_prob placeholder.'''

    with tf.name_scope('reshape'):
        x_image = tf.reshape(x_image,[-1,224,224,1])

    with tf.name_scope('conv1'):
        W_conv1 = weight_variable([5,5,1,32])
        b_conv1 = bias_variable([32])
        h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)

    # Pooling layer - downsamples by 2X.
    with tf.name_scope('pool1'):
        h_pool1 = max_pool_2x2(h_conv1)

    with tf.name_scope('conv2'):
        W_conv2 = weight_variable([5, 5, 32, 64])
        b_conv2 = bias_variable([64])
        h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)

    # Second pooling layer.
    with tf.name_scope('pool2'):
        h_pool2 = max_pool_2x2(h_conv2)

    #FC layer
    with tf.name_scope('fc1'):
        W_fc1 = weight_variable([56 * 56 * 64, 1024])
        b_fc1 = bias_variable([1024])

        h_pool2_flat = tf.reshape(h_pool2, [-1, 56*56*64])
        h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat,W_fc1) + b_fc1)

    #Dropout
    with tf.name_scope('dropout'):
        keep_prob=tf.placeholder(tf.float32)
        h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

    #FC2
    with tf.name_scope('fc2'):
        W_fc2 = weight_variable([1024,7])
        b_fc2 = bias_variable([7])

        y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
    return y_conv, keep_prob 

def main():

    #Read input
    joints,imgs,labels = dataProc.pre_proc_data()

    labels = dataProc.ConvertLabels(labels)
    labels = ReturnOneHotVector(labels) #return labels as one-hot-vector
    imgs = color.rgb2gray(imgs)
    imgs = imgs.reshape([-1, imgs.shape[1]*imgs.shape[2]])

    embed() # drop into an IPython shell to inspect the loaded data

    #-----------------NETWORK 1 -------------------------#

    x = tf.placeholder(tf.float32,[None,imgs.shape[1]])
    y_ = tf.placeholder(tf.int8, [None, n_classes])

    '''sess = tf.Session()
    with sess.as_default():
        embed()
    '''

    #Build convnet
    y_conv, keep_prob = deepnn(x)

    with tf.name_scope('loss'):
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv)
        cross_entropy = tf.reduce_mean(cross_entropy)

    with tf.name_scope('adam_optimizer'):
        train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
        correct_prediction = tf.cast(correct_prediction, tf.float32)

    accuracy = tf.reduce_mean(correct_prediction)
    graph_location = '/home/dhiraj/Desktop/Sample Dataset/record' #tempfile.mkdtemp()
    print('Saving graph to: %s' % graph_location)
    train_writer = tf.summary.FileWriter(graph_location)
    train_writer.add_graph(tf.get_default_graph())

    '''config = tf.ConfigProto()
    config.gpu_options.allocator_type='BFC'
    config.gpu_options.per_process_gpu_memory_fraction = 0.90
    sess = tf.Session(config = config)
    '''
    '''config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.4
    session = tf.Session(config=config)
    '''

    with tf.Session() as sess:
        print "STARTED TENSORLFOW SESSION"
        sess.run(tf.initialize_all_variables())
        step = 1
        count = 0 
        labels = labels.eval() # materialize the one-hot label tensor as a NumPy array
        while step * batch_size < training_iters:
            batch_xs, batch_ys = imgs[count:count + batch_size], labels[count:count + batch_size]
            count = count + batch_size
            sess.run(train_step, feed_dict={x:batch_xs, y_:batch_ys, keep_prob:dropout})
            if step % display_step == 0:
                acc = sess.run(accuracy, feed_dict={x:batch_xs, y_:batch_ys, keep_prob:1.})
                loss = sess.run(cross_entropy, feed_dict={x: batch_xs, y_: batch_ys, keep_prob: 1.})
                print "Iter " + str(step*batch_size) + ", Minibatch Loss= " + "{:.6f}".format(loss) + ", Training Accuracy= " + "{:.5f}".format(acc)

            step = step+1
        print "Optimization Finished!"            

main()

I get the following error:

name: TITAN X (Pascal)
major: 6 minor: 1 memoryClockRate (GHz) 1.531
pciBusID 0000:01:00.0
Total memory: 11.90GiB
Free memory: 11.23GiB
2017-09-16 16:16:14.018792: I tensorflow/core/common_runtime/gpu/gpu_device.cc:976] DMA: 0 
2017-09-16 16:16:14.018797: I tensorflow/core/common_runtime/gpu/gpu_device.cc:986] 0:   Y 
2017-09-16 16:16:14.018809: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1045] Creating TensorFlow device (/gpu:0) -> (device: 0, name: TITAN X (Pascal), pci bus id: 0000:01:00.0)
STARTED TENSORLFOW SESSION
WARNING:tensorflow:From /usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py:175: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Iter 80, Minibatch Loss= 60.153133, Training Accuracy= 0.00000
Iter 160, Minibatch Loss= 40.757141, Training Accuracy= 0.00000
Iter 240, Minibatch Loss= 24.546730, Training Accuracy= 0.00000
Iter 320, Minibatch Loss= 7.033617, Training Accuracy= 0.25000
Iter 400, Minibatch Loss= 9.611395, Training Accuracy= 0.00000
Iter 480, Minibatch Loss= 17.850220, Training Accuracy= 0.50000
Iter 560, Minibatch Loss= 9.400875, Training Accuracy= 0.00000
Iter 640, Minibatch Loss= 7.338353, Training Accuracy= 0.25000
Iter 720, Minibatch Loss= 3.645672, Training Accuracy= 0.25000
Iter 800, Minibatch Loss= 1.157605, Training Accuracy= 0.25000
2017-09-16 16:16:31.545341: E tensorflow/core/common_runtime/bfc_allocator.cc:244] tried to allocate 0 bytes
2017-09-16 16:16:31.545369: W tensorflow/core/common_runtime/allocator_retry.cc:32] Request to allocate 0 bytes
2017-09-16 16:16:31.545375: E tensorflow/core/common_runtime/bfc_allocator.cc:244] tried to allocate 0 bytes
2017-09-16 16:16:31.545379: W tensorflow/core/common_runtime/allocator_retry.cc:32] Request to allocate 0 bytes
2017-09-16 16:16:31.545433: E tensorflow/core/common_runtime/bfc_allocator.cc:378] tried to deallocate nullptr
2017-09-16 16:16:31.545482: E tensorflow/core/common_runtime/bfc_allocator.cc:378] tried to deallocate nullptr
-----------------------------------------------------------
[SOME TEXT HERE ]
    ResourceExhaustedError: Ran out of GPU memory when allocating 0 bytes for 
         [[Node: loss/SoftmaxCrossEntropyWithLogits = SoftmaxCrossEntropyWithLogits[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](loss/Reshape, loss/Reshape_1)]]

    Caused by op u'loss/SoftmaxCrossEntropyWithLogits', defined at:
      File "tensorflow_version_rgb.py", line 162, in <module>
        main()
      File "tensorflow_version_rgb.py", line 117, in main
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits = y_conv)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 1597, in softmax_cross_entropy_with_logits
        precise_logits, labels, name=name)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_nn_ops.py", line 2385, in _softmax_cross_entropy_with_logits
        features=features, labels=labels, name=name)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
        op_def=op_def)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2630, in create_op
        original_op=self._default_original_op, op_def=op_def)
      File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
        self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): Ran out of GPU memory when allocating 0 bytes for 
     [[Node: loss/SoftmaxCrossEntropyWithLogits = SoftmaxCrossEntropyWithLogits[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](loss/Reshape, loss/Reshape_1)]]

How many iterations the code gets through depends on the batch size: if I reduce it to 1, it runs for a few more iterations and then fails with the same error. I checked imgs.nbytes and it is 334774272 bytes. I have run other TensorFlow tutorial code on this machine and it seems to work fine.
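One thing I am unsure about: count never wraps around, so once it passes the end of my dataset (834 images, described below), imgs[count:count + batch_size] returns an empty array, which seems to line up with the "allocate 0 bytes" messages above. Below is a hypothetical wrap-around batching loop I sketched while debugging (assuming the data should repeat across epochs; num_examples is a name I introduce here, not from my code above):

num_examples = imgs.shape[0]  # 834 in my case
step = 1
while step * batch_size < training_iters:
    # Modulo indexing keeps every batch non-empty, whereas
    # imgs[count:count + batch_size] silently returns an empty
    # array once count >= num_examples.
    start = ((step - 1) * batch_size) % num_examples
    idx = np.arange(start, start + batch_size) % num_examples
    batch_xs, batch_ys = imgs[idx], labels[idx]
    sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys, keep_prob: dropout})
    step = step + 1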

My input data imgs is an (834, 224, 224) array of grayscale images, and labels is an (834, 7) matrix.
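That nbytes figure checks out by hand, assuming skimage's rgb2gray returns float64 pixels (as I understand it does): 834 * 224 * 224 * 8 bytes = 334774272 bytes, roughly 320 MiB, so the raw images alone are nowhere near 11 GiB:

import numpy as np

# 834 grayscale images of 224 x 224 pixels, 8 bytes per float64 pixel.
n_imgs, h, w = 834, 224, 224
print n_imgs * h * w * np.dtype(np.float64).itemsize  # 334774272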

Even while the above code is running, it reports:

Total memory: 11.90GiB
Free memory: 11.23GiB

My nvidia-smi output is: nvidia-smi
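For completeness: besides the two commented-out ConfigProto attempts in the code above, another GPU memory option I am aware of (but have not tried here) is allow_growth, which lets TensorFlow allocate memory on demand instead of reserving a fixed fraction up front:

# Hypothetical alternative to the per_process_gpu_memory_fraction
# attempts commented out in main(): grow the allocation on demand.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)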

0 Answers:

There are no answers yet.