Tensorflow Python已停止运行内存不足

时间:2017-04-12 14:25:31

标签: python-3.x numpy tensorflow

我使用Tensorflow MLP来训练CIFAR 100 python数据集,但是当我执行代码时,有人可以帮助我将batch_ys送入y占位符并运行代码,我目前正在获取这个,我不确定是否还有更多,Windows 10说" Python已停止工作",这里是代码(8-3.py):

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import time
import os
from read import unpickle
dir = os.path.dirname(os.path.realpath(__file__))
from read_label import read_label
current_batch = 0
t1 = time.time()
# Load MNIST Data

from tensorflow.examples.tutorials.mnist import input_data
#mnist = input_data.read_data_sets(dir + "/MNIST_data/", one_hot=True)

# Learning Parameters
learning_rate = 0.001
training_epochs = 1500
batch_size = 5500
display_step = 1

# Network Parameters
n_hidden_1 = 1024 # 1st layer num features
n_hidden_2 = 1024 # 2nd layer num features
n_hidden_3 = 1024
n_hidden_4 = 1024
n_input = 3072 # MNIST data input (img shape: 28*28)
n_classes = 100 # MNIST total classes (0-9 digits)

# tf Graph input
x = tf.placeholder("float", [None, 3072])
y = tf.placeholder("float", [None, 100])

#weights layer 1
h = tf.Variable(tf.random_normal([n_input, n_hidden_1]))
#bias layer 1
bias_layer_1 = tf.Variable(tf.random_normal([n_hidden_1]))
#layer 1
layer_1 = tf.nn.relu(tf.add(tf.matmul(x,h),bias_layer_1))

#weights layer 2
w = tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2]))
#bias layer 2
bias_layer_2 = tf.Variable(tf.random_normal([n_hidden_2]))
#layer 2
layer_2 = tf.nn.relu(tf.add(tf.matmul(layer_1,w),bias_layer_2))

h1 = tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3]))
bias_layer_3 = tf.Variable(tf.random_normal([n_hidden_3]))
layer_3 = tf.nn.relu(tf.add(tf.matmul(layer_2, h1), bias_layer_3))

w1 = tf.Variable(tf.random_normal([n_hidden_3, n_hidden_4]))
bias_layer_4 = tf.Variable(tf.random_normal([n_hidden_4]))
layer_4 = tf.nn.relu(tf.add(tf.matmul(layer_3, w1), bias_layer_4))

#weights output layer
output = tf.Variable(tf.random_normal([n_hidden_4, n_classes]))
#bias output layer
bias_output = tf.Variable(tf.random_normal([n_classes]))
#output layer
output_layer = tf.matmul(layer_4, output) + bias_output

# cost function
# cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output_layer, y))
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=output_layer, labels=y))

# optimizer
optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost) 
#optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)

#Plot settings
avg_set = []
epoch_set=[]

# Initializing the variables
# init = tf.initialize_all_variables()
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:

     sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.0
        # trchou
        total_batch = int(50000/batch_size)
        # total_batch = 2000
        # Loop over all batches
        for i in range(total_batch):
            # trchou
            #batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            batch_xs = unpickle('train')[b'data'][current_batch:batch_size, :]
            #batch_ys = read_label('train')[current_batch:batch_size]
            batch_ys = tf.one_hot(read_label('train')[current_batch:batch_size], depth=100, dtype="float").eval()
            print(x)
            print(batch_ys)
            #print(read_label('train').shape)
            # Fit training using batch data
            sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})
            # Compute average loss
            avg_cost += sess.run(cost, feed_dict={x: batch_xs, y: batch_ys})/total_batch
            current_batch += batch_size
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))
        avg_set.append(avg_cost)
        epoch_set.append(epoch+1)
        '''
        if(cost == 0.000000000):
            print("The cost value of this training has reached 0, exit? (y/n)")
            a = input()
            if(a == 'y'):
                print("You chose to break it.")
                break
            elif(a == 'n'):
                print("Training will continue.")
        '''
    t2 = time.time()
    t_min = int((t2-t1)/60)
    t_sec = int((t2-t1)%60)
    print("Training phase finished, time elapsed {:d}min {:d}secs.".format(t_min, t_sec))

    # Plot the learning curve
    plt.plot(epoch_set,avg_set, 'o', label='MLP Training phase')
    plt.ylabel('cost')
    plt.xlabel('epoch')
    plt.legend()
    plt.show()

    # Save the model after learning
    model_saver = tf.train.Saver()
    model_saver.save(sess, "C:/cifar-model/my_model_mlp.chkp")

    # Testing cycle
    correct_prediction = tf.equal(tf.argmax(output_layer, 1), tf.argmax(y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Model accuracy:", accuracy.eval({x: unpickle('test')[b'data'], y: read_label('test')}))
'''
# Restore model & testing
with tf.Session() as sess:

    model_saver.restore(sess, "C:/model-batchsize_55000_epoch_500_4_hiddens_learningrate_0.001/my_model_mlp.chkp")
    print("Model restored.") 
    print("Initialized")
    # Test model
    correct_prediction = tf.equal(tf.argmax(output_layer, 1), tf.argmax(y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Model accuracy:", accuracy.eval({x: batch_xs, y: batch_ys}))
'''

回溯:

C:\Users\Administrator\learn_tensorflow\cifar-100-python>python 8-3.py
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:135] 
successfully opened CUDA library cublas64_80.dll locally
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:135] 
successfully opened CUDA library cudnn64_5.dll locally
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:135] 
successfully opened CUDA library cufft64_80.dll locally
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:135] 
successfully opened CUDA library nvcuda.dll locally
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_loader.cc:135] 
successfully opened CUDA library curand64_80.dll locally
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "BestSplits" device_type: "CPU"') for unknown op: BestSplits
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "CountExtremelyRandomStats" device_type: "CPU"') for unknown op: CountExtremelyRandomStats
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "FinishedNodes" device_type: "CPU"') for unknown op: FinishedNodes
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "GrowTree" device_type: "CPU"') for unknown op: GrowTree
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "ReinterpretStringToFloat" device_type: "CPU"') for unknown op: ReinterpretStringToFloat
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "SampleInputs" device_type: "CPU"') for unknown op: SampleInputs
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "ScatterAddNdim" device_type: "CPU"') for unknown op: ScatterAddNdim
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] OpKernel ('op: "TopNInsert" device_type: "CPU"') for unknown op: TopNInsert
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "TopNRemove" device_type: "CPU"') for unknown op: TopNRemove
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "TreePredictions" device_type: "CPU"') for unknown op: TreePredictions
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\framework\op_kernel.cc:943] 
OpKernel ('op: "UpdateFertileSlots" device_type: "CPU"') for unknown op: UpdateFertileSlots
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:885] Found device 0 with properties:
name: GeForce GT 730
major: 3 minor: 5 memoryClockRate (GHz) 0.9015
pciBusID 0000:01:00.0
Total memory: 2.00GiB
Free memory: 1.66GiB
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:906] DMA: 0
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:916] 0:   Y
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GT 730, pci bus id: 0000:01:00.0)
<class 'list'>
Tensor("Placeholder:0", shape=(?, 3072), dtype=float32)
<class 'list'>
Tensor("Placeholder:0", shape=(?, 3072), dtype=float32)
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\bfc_allocator.cc:244] tried to allocate 0 bytes
W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\allocator_retry.cc:32] Request to allocate 0 bytes
F c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:104] EigenAllocator for GPU ran out of memory when allocating 0. See error logs for more detailed info.

C:\Users\Administrator\learn_tensorflow\cifar-100-python>

2 个答案:

答案 0 :(得分:0)

内存不足,这意味着您尝试在每一步传递过多数据。

这是因为你的batch_size太高,尝试小值(比方说32)并查看它是否有效。出于性能原因,您可以在之后尝试更高的值。

使用批处理以通过执行并行操作来提高性能。换句话说,您加载更多数据以加快速度。但是加载更多数据会产生内存成本,您必须将其加载到GPU RAM中。

您有两种情况:

  • 太慢了我不使用我所有的GPU RAM =&gt;增加batch_size
  • 我的RAM =&gt;减少batch_size

如果你到达batch_size=1仍然触发OOM的点,那么你的GPU上没有足够的RAM:

  • 尝试更简单的任务,维度更少
  • 不使用GPU而是CPU,那么你将使用你的CPU RAM,通常你的CPU RAM比GPU RAM多。

答案 1 :(得分:0)

尝试注释掉model_saver.save(sess, "C:/cifar-model/my_model_mlp.chkp")行并进行检查。

我在使用saver.save选项时遇到了同样的问题,但如果没有该选项,它将运行良好。