TensorFlow:为什么会报 CUDA_ERROR_OUT_OF_MEMORY 错误并且无法训练?

时间:2017-07-13 05:43:49

标签: tensorflow

我是 TensorFlow 新手,今天运行我的训练代码时出现了错误,无法运行。详细信息如下:当我训练我的数据时,它会显示:

# coding=utf-8
from color_1 import read_and_decode, get_batch
import LeNet_5
import os
import tensorflow as tf

batch_size =16
TRAIN_STEPS = 10000
crop_size = 224
REGULARAZTION_RATE=0.0001

def train(batch_x, batch_y):
    image_holder = tf.placeholder(tf.float32, [batch_size, 224, 224, 3], name='x-input')
    label_holder = tf.placeholder(tf.float32, [batch_size], name='y-input')
    regularizer = tf.contrib.layers.l2_regularizer(REGULARAZTION_RATE)
    y = LeNet_5.inference(image_holder, train,regularizer)
    global_step = tf.Variable(0, trainable=False)

    def loss(logits, labels):
        labels = tf.cast(labels, tf.int64)
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=labels, name='cross_entropy_per_example')
        cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
        tf.add_to_collection('losses', cross_entropy_mean)
        return tf.add_n(tf.get_collection('losses'), name='total_loss')

    loss = loss(y, label_holder)
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)
    tf.add_to_collection('train_op', train_op)
    saver = tf.train.Saver(max_to_keep=3)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        for i in range(TRAIN_STEPS):
            image_batch, label_batch = sess.run([batch_x, batch_y])
            _, loss_value, step = sess.run([train_op, loss, global_step], feed_dict={image_holder: image_batch, label_holder: label_batch})
            if i % 100 == 0:
                format_str = ('After %d step,loss on training batch is: %g')
                print (format_str % (i, loss_value))
        coord.request_stop()
        coord.join(threads)

def main(argv=None):
    image, label = read_and_decode('train_day_night.tfrecords')
    batch_image, batch_label = get_batch(image, label, batch_size, crop_size)
    train(batch_image, batch_label)

if __name__ == '__main__':
    tf.app.run()

并执行以下几个步骤:enter image description here

然后会出现这样的错误:enter image description here

最后,它将成为:enter image description here

我的代码在这里:

# -*- coding:utf-8 -*-    
import tensorflow as tf
def inference(input_tensor, train, regularizer):
    """Build the forward graph of a LeNet-5 style CNN and return its logits.

    Layer stack: conv 5x5x32 -> max-pool 2x2 -> conv 5x5x64 -> max-pool 2x2
    -> fully connected 512 (ReLU, optional dropout) -> fully connected 2.

    Args:
        input_tensor: 4-D tensor passed to ``tf.nn.conv2d``. The first kernel
            is ``[5, 5, 3, 32]``, so the last dimension must be 3. Height and
            width must be statically known because they size the first fully
            connected layer; the batch dimension may be unknown.
        train: When truthy, dropout with keep probability 0.5 is applied to
            the output of the first fully connected layer.
        regularizer: Callable applied to each fully connected weight matrix;
            its result is added to the ``'losses'`` collection. Pass ``None``
            to disable regularization.

    Returns:
        2-D tensor of unnormalized logits with 2 columns.
    """
    with tf.variable_scope('layer1-conver1'):
        conv1_weights = tf.get_variable(
            "weight", [5, 5, 3, 32],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        # Constant bias init: truncated_normal_initializer(0.0) would set
        # mean=0.0 but keep the default stddev=1.0, i.e. large random biases.
        conv1_biases = tf.get_variable(
            "biase", [32], initializer=tf.constant_initializer(0.0))
        conv1 = tf.nn.conv2d(
            input_tensor, conv1_weights, strides=[1, 1, 1, 1], padding='SAME')
        relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_biases))

    with tf.variable_scope('layer2-pool1'):
        pool1 = tf.nn.max_pool(
            relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    with tf.variable_scope('layer3-conv2'):
        conv2_weights = tf.get_variable(
            "weight", [5, 5, 32, 64],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        conv2_biases = tf.get_variable(
            "biase", [64], initializer=tf.constant_initializer(0.0))
        conv2 = tf.nn.conv2d(
            pool1, conv2_weights, strides=[1, 1, 1, 1], padding='SAME')
        relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_biases))

    with tf.variable_scope('layer4-pool2'):
        pool2 = tf.nn.max_pool(
            relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Flatten the feature maps into one vector per example.
    pool_shape = pool2.get_shape().as_list()
    nodes = pool_shape[1] * pool_shape[2] * pool_shape[3]
    # -1 lets TensorFlow infer the batch size, so a placeholder with an
    # unknown batch dimension works too; for a fixed batch size the result
    # is identical to using pool_shape[0].
    reshaped = tf.reshape(pool2, [-1, nodes])

    with tf.variable_scope('layer5-fc1'):
        # NOTE: this matrix dominates the model's memory use. For example, a
        # 224x224 input gives nodes = 56 * 56 * 64 = 200704, i.e. roughly
        # 103M float32 weights (~400 MB) before optimizer slots are counted.
        fc1_weights = tf.get_variable(
            "weight", [nodes, 512],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        if regularizer is not None:
            tf.add_to_collection('losses', regularizer(fc1_weights))
        fc1_biases = tf.get_variable(
            "biases", [512], initializer=tf.constant_initializer(0.1))

        fc1 = tf.nn.relu(tf.matmul(reshaped, fc1_weights) + fc1_biases)
        if train:
            fc1 = tf.nn.dropout(fc1, 0.5)

    with tf.variable_scope('layer6-fc2'):
        fc2_weights = tf.get_variable(
            "weight", [512, 2],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        if regularizer is not None:
            tf.add_to_collection('losses', regularizer(fc2_weights))
        fc2_biases = tf.get_variable(
            "biases", [2], initializer=tf.constant_initializer(0.1))
        logit = tf.matmul(fc1, fc2_weights) + fc2_biases
    return logit

我使用的LeNet_5就在这里:

SOAPAction

.tfrecords 文件中的图像尺寸为 224 * 224 * 3。现在我真的不知道原因,也不知道该如何解决。你能帮帮我吗?非常感谢!如果您需要更多信息,请告诉我。

1 个答案:

答案 0 :(得分:1)

这意味着您的 GPU 显存已用完。您可以考虑把输入管道(get_batch() 和 read_and_decode())移到 CPU 上执行,使用 with tf.device('/cpu:0'): 包裹相应代码即可实现这一点。

此建议详见tensorflow Performance guide