TensorFlow GPU CNN CuDNN errors on Windows 10

Asked: 2017-05-02 10:21:25

Tags: python tensorflow windows-10 anaconda

I am trying to train a CNN on CIFAR-100 with TensorFlow in Python, but the errors CUDA_ERROR_OUT_OF_MEMORY, CUDNN_STATUS_NOT_INITIALIZED, and CUDNN_STATUS_BAD_PARAM keep haunting me. TensorFlow is installed in an Anaconda virtual environment on my machine; the Python version is Anaconda Python 3.5 and the TensorFlow version is 1.1.0. Here is my code:

tf_cifar_learning.py:

# Set working directory

import os
dir_model = "c:/tf_model_cifar100"

# Modules needed

import numpy as np
import tensorflow as tf
import pandas as pd
from mlxtend.preprocessing import one_hot


# Load CIFAR Data
from batch import next_batch
from read import unpickle
import time
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.995)


# Prepare test data

testdata = unpickle('test')[b'data']
testdata1 = testdata.astype('float')
del testdata
testdata = testdata1[0:5000, :]
testlabel = unpickle('test')[b'fine_labels'][0:5000] # fine labels: the network predicts 100 classes
testlabel = one_hot(testlabel, 100)
for i in range(testdata.shape[0]):
    for j in range(3072):
        testdata[i][j] = float(testdata[i][j]) / 255.0
    if(i % 1000 == 0):
        print("%d of 5000 test datasets processed" % i)

# Parameters
learning_rate = 0.001
training_iters = 1000000
batch_size = 10 # 128
display_step = 2

# Network Parameters
n_input = 1024*3 # CIFAR data input (img shape: 32*32*3 = 3072)
n_classes = 100 # CIFAR total classes
dropout = 0.75 # Dropout, probability to keep units

# tf Graph input
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_classes])
keep_prob = tf.placeholder(tf.float32) #dropout (keep probability)

# Create some wrappers for simplicity
def conv2d(x, W, b, strides=1):
    # Conv2D wrapper, with bias and relu activation
    x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
    x = tf.nn.bias_add(x, b)
    o = tf.nn.relu(x)
    return o

def maxpool2d(x, k=2):
    # MaxPool2D wrapper
    o = tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1], padding='SAME')
    return o

# Create model
def conv_net(x, weights, biases, dropout):
    # Reshape input picture
    x = tf.reshape(x, shape=[-1, 32, 32, 3])
    # Convolution Layer
    conv1 = conv2d(x, weights['wc1'], biases['bc1'])
    # Max Pooling (down-sampling)
    conv1 = maxpool2d(conv1, k=2)

    # Convolution Layer
    conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
    # Max Pooling (down-sampling)
    conv2 = maxpool2d(conv2, k=2)

    # Fully connected layer
    # Reshape conv2 output to fit fully connected layer input
    fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
    fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
    fc1 = tf.nn.relu(fc1)
    # Apply Dropout
    fc1 = tf.nn.dropout(fc1, dropout)

    # Output, class prediction
    out = tf.add(tf.matmul(fc1, weights['out']), biases['out'])
    return out

# Store layers weight & bias
weights = {
    # 5x5 conv, 3 inputs (RGB channels), 32 outputs
    'wc1': tf.Variable(tf.random_normal([5, 5, 3, 32])),
    # 5x5 conv, 32 inputs, 64 outputs
    'wc2': tf.Variable(tf.random_normal([5, 5, 32, 64])),
    # fully connected, 8*8*64 inputs, 1024 outputs
    'wd1': tf.Variable(tf.random_normal([8*8*64, 1024])),
    # 1024 inputs, 100 outputs (class prediction)
    'out': tf.Variable(tf.random_normal([1024, n_classes]))
}

biases = {
    'bc1': tf.Variable(tf.random_normal([32])),
    'bc2': tf.Variable(tf.random_normal([64])),
    'bd1': tf.Variable(tf.random_normal([1024])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

# Construct model
pred = conv_net(x, weights, biases, keep_prob)

# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Evaluate model
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initializing the variables
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    sess.run(init)
    step = 1
    # Time measuring
    t1 = time.time()
    # Keep training until reach max iterations
    while step * batch_size < training_iters:
        # Prepare training batch
        batch_x, batch_y = next_batch(batch_size)
        batch_x1 = np.zeros([batch_size, 3072], dtype="float32")
        for i in range(batch_size):
            for j in range(3072):
                batch_x1[i][j] = batch_x[i][j] / 255.0
            #if(i % 200 == 0):
                #print("%d of %d training batch images processed" % (i, batch_size))
        # Run optimization op (backprop)
        sess.run(optimizer, feed_dict={x: batch_x1, y: batch_y, keep_prob: dropout})
        if step % display_step == 0:
            # Calculate batch loss and accuracy
            loss, acc = sess.run([cost, accuracy], feed_dict={x: batch_x1, y: batch_y, keep_prob: 1.})
            # Calculate accuracy for all test samples
            acc = accuracy.eval({x: testdata, y: testlabel, keep_prob: 1.})
            # Time measuring
            t2 = time.time()
            tmp = t2-t1
            sec = tmp % 60
            m = int(tmp / 60)
            print("Iter# %8d"%(step*batch_size) + \
                  ", Minibatch Loss= %16.10f"%(loss) + \
                  ", Testing Accuracy= %8.6f"%(acc) + \
                  ", Training currently elapsed " + \
                  "{:d} mins {:f} secs".format(m, sec))   
        step += 1
    print("Optimization Finished!")
    # Save the model after learning
    model_saver = tf.train.Saver()
    model_saver.save(sess, dir_model + "/CIFAR-100_cnn_model.chkp")

batch.py:

def next_batch(batch_size, onehot=True):
    import numpy as np
    import tensorflow as tf
    from read import unpickle

    # Keep the read position as a function attribute so it persists
    # across calls and each call returns the next slice of the data.
    if not hasattr(next_batch, "current_batch"):
        next_batch.current_batch = 0

    dict_data = unpickle('train')
    start = next_batch.current_batch
    label = np.array(dict_data[b'fine_labels'][start:start + batch_size])
    data = dict_data[b'data'][start:start + batch_size, :]
    next_batch.current_batch += batch_size
    data = np.reshape(data, (batch_size, 3072))
    # One-hot encode the labels on the CPU
    with tf.device('/cpu:0'):
        if onehot:
            label = tf.Session().run(tf.one_hot(label, 100))
    return data, label

read.py:

def unpickle(file):
    import pickle
    with open(file, 'rb') as f:
        # CIFAR pickles use bytes keys under Python 3, hence encoding='bytes'
        data = pickle.load(f, encoding='bytes')
    return data

Output of python tf_cifar_learning.py in Windows CMD:

(tensorflow) C:\Users\Administrator\learn_tensorflow\cifar-100-python>python tf_cifar_learning.py
0 of 5000 test datasets processed
1000 of 5000 test datasets processed
2000 of 5000 test datasets processed
3000 of 5000 test datasets processed
4000 of 5000 test datasets processed
2017-05-02 17:48:46.635855: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE instructions, but these are available on your machine and could speed up CPU computations.
2017-05-02 17:48:46.635975: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE2 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-02 17:48:46.637256: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE3 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-02 17:48:46.638434: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.1 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-02 17:48:46.638939: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-02 17:48:46.639456: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
2017-05-02 17:48:46.641753: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-02 17:48:46.641909: W c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
2017-05-02 17:48:46.994154: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:887] Found device 0 with properties:
name: GeForce GT 730
major: 3 minor: 5 memoryClockRate (GHz) 0.9015
pciBusID 0000:01:00.0
Total memory: 2.00GiB
Free memory: 1.66GiB
2017-05-02 17:48:46.994318: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:908] DMA: 0
2017-05-02 17:48:46.997080: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:918] 0:   Y
2017-05-02 17:48:46.997985: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:977] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GT 730, pci bus id: 0000:01:00.0)
2017-05-02 17:48:46.999359: E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_driver.cc:893] failed to allocate 1.99G (2136745984 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
2017-05-02 17:48:46.999434: E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_driver.cc:893] failed to allocate 1.79G (1923071488 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
2017-05-02 17:48:47.766766: I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:977] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GT 730, pci bus id: 0000:01:00.0)
2017-05-02 17:48:48.334298: E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_dnn.cc:359] could not create cudnn handle: CUDNN_STATUS_NOT_INITIALIZED
2017-05-02 17:48:48.334466: E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_dnn.cc:366] error retrieving driver version: Unimplemented: kernel reported driver version not implemented on Windows
2017-05-02 17:48:48.343454: E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_dnn.cc:326] could not destroy cudnn handle: CUDNN_STATUS_BAD_PARAM
2017-05-02 17:48:48.343558: F c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\kernels\conv_ops.cc:659] Check failed: stream->parent()->GetConvolveAlgorithms(&algorithms)

(tensorflow) C:\Users\Administrator\learn_tensorflow\cifar-100-python>

Then Windows 10 says "Python has stopped working" and kills it immediately. Can someone tell me what the problem is and show me (or maybe give me an example of) how to fix it?

4 Answers:

Answer 0 (score: 0):

The problem is most likely related to your environment.

You only have one GPU, and it is probably also driving your display. That is why TensorFlow cannot allocate all the memory it asks for up front. You can control how much GPU memory is used with per_process_gpu_memory_fraction, as shown here:

https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/framework/test_util.py#L388
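For example, a minimal sketch using the TF 1.x API from the question (the 0.6 fraction is an illustrative value, not a tested recommendation):

import tensorflow as tf

# Reserve only part of the GPU memory up front, leaving the rest
# for the display driver and other processes on the card.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))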

Regarding cuDNN, it seems the cuDNN library could not initialize itself ("CUDNN_STATUS_NOT_INITIALIZED"). Are you sure you can run other CUDA and cuDNN samples in that environment?

Answer 1 (score: 0):

Try lowering per_process_gpu_memory_fraction=0.995 to a smaller value such as 0.7 or 0.6, as shown below.
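Applied to the question's script, the change would look like this (0.6 is just an illustrative value; try a few fractions):

# Was: gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.995)
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)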

Answer 2 (score: 0):

Now I know what was going on. It really was an OOM. A reboot and a smaller batch size did the job.

Answer 3 (score: 0):

I ran into the same error with tensorflow-gpu==1.13.1 installed via conda. After several days of struggling, I solved it with the code below:

import tensorflow as tf
from keras import backend as K  # needed for K.set_session below

config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # allocate GPU memory on demand rather than all at once
sess = tf.Session(config=config)
K.set_session(sess)

I think the problem was caused by the monitor being plugged into the graphics card.