显卡 CUDA / TensorFlow 训练时出现 OOM(显存不足)错误

时间:2017-06-29 14:48:10

标签: tensorflow cuda deep-learning nvidia

你好,我正在尝试训练一个模型,却收到了下面的错误信息。如果我理解正确,它是说显存(VRAM)不足,但我用的是华硕 GTX 1080 A8G Gaming 显卡,显存应该足够。之前运行时一切正常,但不知为何突然就无法运行了。我的深度网络代码如下:

    # -*- coding: utf-8 -*-
"""
Created on Thu Jun 29 11:52:11 2017
@author: tobia
"""
#importing pre_processing libraries
import numpy as np
from keras.models import load_model
import os
#importing Deep Learning Libraries
from keras import layers
from keras.models import Sequential
from keras.callbacks import TensorBoard
from keras.layers import Flatten,Dense,Conv2D,MaxPooling2D,Dropout,BatchNormalization,Activation
def load_data():
    """Load every recorded key-press array together with its video frames.

    The number of entries in ``data/key_values`` decides how many recordings
    are read. Recording ``k`` (1-based) is loaded from
    ``data/key_values/values_k.npy`` and ``data/video/video_k.npy``.

    Returns:
        tuple: ``(key_values, picture_data)``. ``key_values`` is the row-wise
        concatenation of all key arrays, seeded with an empty ``(0, 8)``
        uint8 array. ``picture_data`` is the concatenation of all frame
        arrays reshaped to ``(len(key_values), 60, 80, 1)``.

    Raises:
        ValueError: If the loaded frames cannot be reshaped to
            ``(len(key_values), 60, 80, 1)``, e.g. when the number of frames
            does not match the number of key rows.
    """
    recording_count = len(os.listdir('data/key_values'))

    # Seed both lists with empty arrays so that an empty data directory still
    # yields arrays of the expected rank and dtype.
    key_chunks = [np.empty((0, 8), dtype='uint8')]
    frame_chunks = [np.empty((0, 60, 80), dtype='uint8')]

    for index in range(1, recording_count + 1):
        key_chunks.append(
            np.load('data/key_values/values_{0}.npy'.format(index)))
        frame_chunks.append(
            np.load('data/video/video_{}.npy'.format(index)))

    # Concatenate once per array: calling np.append inside the loop would
    # re-copy everything accumulated so far for every file.
    key_values = np.concatenate(key_chunks, axis=0)
    picture_data = np.concatenate(frame_chunks, axis=0)

    # Add the trailing channel axis that the Conv2D input layer expects.
    picture_data = picture_data.reshape((len(key_values), 60, 80, 1))
    return key_values, picture_data
class Network:
    """Builds, trains and queries the small convolutional key-prediction net."""

    def __init__(self):
        # No state is needed up front; predict_key() sets ``self.model``.
        pass

    def model_1(self, picture_data, key_values):
        """Build and compile the Sequential model.

        Args:
            picture_data: Unused here; kept so existing callers keep working.
            key_values: Unused here; kept so existing callers keep working.

        Returns:
            The compiled Keras ``Sequential`` model (its summary is printed
            as a side effect).
        """
        model = Sequential()
        model.add(Conv2D(96, 11, input_shape=(60, 80, 1), activation="relu"))
        # With strides=1 the pooling layer barely shrinks the feature maps,
        # so Flatten feeds a very wide vector into the Dense layer (the
        # reported OOM traceback shows a [313344, 8] kernel gradient).
        model.add(MaxPooling2D(pool_size=3, strides=1))
        # NOTE(review): input_shape (60, 80, 1) looks channels-last, in which
        # case axis=1 normalises over image rows rather than over the 96
        # feature maps -- confirm whether axis=-1 was intended.
        model.add(BatchNormalization(axis=1))
        model.add(Flatten())
        model.add(Dense(units=8, activation="softmax"))
        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])

        model.summary()
        return model

    def start(self, picture_data, key_values, batch_size=1000, epochs=10):
        """Build a fresh model and train it on the given data.

        Args:
            picture_data: Training images passed to ``model.fit``.
            key_values: Training targets passed to ``model.fit``.
            batch_size: Samples per gradient step. Defaults to the original
                hard-coded 1000; activation memory grows with this value, so
                lower it when the GPU reports ResourceExhaustedError/OOM.
            epochs: Number of passes over the data (default 10, as before).

        Returns:
            The trained model, so the caller can save or reuse it instead of
            losing it when this method returns.
        """
        model = self.model_1(picture_data, key_values)
        tb_callback = TensorBoard(log_dir="./logs",
                                  histogram_freq=0,
                                  write_graph=True,
                                  write_images=True)
        model.fit(picture_data,
                  key_values,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_split=0.1,
                  callbacks=[tb_callback])
        return model

    def predict_key(self, live_image, model):
        """Run ``model`` on ``live_image`` and return its raw prediction.

        The model is also stored on the instance as ``self.model``.

        Args:
            live_image: Input forwarded unchanged to ``model.predict``.
            model: Object exposing ``predict(data, batch_size=...)``.

        Returns:
            Whatever ``model.predict(live_image, batch_size=3)`` returns.
        """
        self.model = model
        return self.model.predict(live_image, batch_size=3)

# Entry point: ask whether to train a fresh network or to continue training
# the checkpoint stored in Models/Modell.h5.
input_k = input("Start new Training press: N or to contiune learning press C")
# Accept lowercase answers and stray whitespace instead of silently doing
# nothing when the answer is not exactly 'N' or 'C'.
choice = input_k.strip().upper()
if choice == 'N':
    # NOTE(review): this branch does not write Models/Modell.h5, so the 'C'
    # branch needs a checkpoint that was produced some other way.
    key_values, picture_data = load_data()
    test = Network()
    test.start(picture_data, key_values)
elif choice == 'C':
    model = load_model('Models/Modell.h5')
    visual = TensorBoard(log_dir="./logs",
                         histogram_freq=0,
                         write_graph=True,
                         write_images=True)
    key_values, picture_data = load_data()
    model.fit(picture_data,
              key_values,
              batch_size=1000,
              epochs=1,
              validation_split=0.1,
              callbacks=[visual])
    # Overwrite the checkpoint with the further-trained weights.
    model.save("Models/Modell.h5")
else:
    # Make the no-op explicit so a typo is not mistaken for a finished run.
    print("Unrecognised choice {!r}: expected N or C, nothing was trained."
          .format(input_k))

错误讯息:

File "<ipython-input-1-73951c078cac>", line 1, in <module>
    runfile('C:/Users/tobia/Desktop/Ai_Star/ai_train.py', wdir='C:/Users/tobia/Desktop/Ai_Star')
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\site\sitecustomize.py", line 880, in runfile
    execfile(filename, namespace)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)
  File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 81, in <module>
    test.start(picture_data,key_values)
  File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 66, in start
    model.fit(picture_data,key_values,batch_size = 1000,epochs =10,validation_split = 0.1,callbacks = [tbCallBack])
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\models.py", line 870, in fit
    initial_epoch=initial_epoch)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\engine\training.py", line 1507, in fit
    initial_epoch=initial_epoch)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\engine\training.py", line 1156, in _fit_loop
    outs = f(ins_batch)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\backend\tensorflow_backend.py", line 2269, in __call__
    **self.session_kwargs)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\client\session.py", line 789, in run
    run_metadata_ptr)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\client\session.py", line 997, in _run
    feed_dict_string, options, run_metadata)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\client\session.py", line 1132, in _do_run
    target_list, options, run_metadata)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\client\session.py", line 1152, in _do_call
    raise type(e)(node_def, op, message)
ResourceExhaustedError: OOM when allocating tensor with shape[313344,8]
 [[Node: gradients/dense_1/MatMul_grad/MatMul_1 = MatMul[T=DT_FLOAT, _class=["loc:@dense_1/MatMul"], transpose_a=true, transpose_b=false, _device="/job:localhost/replica:0/task:0/gpu:0"](flatten_1/Reshape, gradients/dense_1/Softmax_grad/mul_1)]]
Caused by op 'gradients/dense_1/MatMul_grad/MatMul_1', defined at:
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\ipython\start_kernel.py", line 231, in <module>
    main()
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\ipython\start_kernel.py", line 227, in main
    kernel.start()
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\ipykernel\kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tornado\ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\ipykernel\kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\ipykernel\ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\IPython\core\interactiveshell.py", line 2698, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\IPython\core\interactiveshell.py", line 2808, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\IPython\core\interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-1-73951c078cac>", line 1, in <module>
    runfile('C:/Users/tobia/Desktop/Ai_Star/ai_train.py', wdir='C:/Users/tobia/Desktop/Ai_Star')
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\site\sitecustomize.py", line 880, in runfile
    execfile(filename, namespace)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)
  File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 81, in <module>
    test.start(picture_data,key_values)
  File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 66, in start
    model.fit(picture_data,key_values,batch_size = 1000,epochs =10,validation_split = 0.1,callbacks = [tbCallBack])
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\models.py", line 870, in fit
    initial_epoch=initial_epoch)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\engine\training.py", line 1490, in fit
    self._make_train_function()
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\engine\training.py", line 1014, in _make_train_function
    self.total_loss)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\optimizers.py", line 405, in get_updates
    grads = self.get_gradients(loss, params)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\optimizers.py", line 71, in get_gradients
    grads = K.gradients(loss, params)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\backend\tensorflow_backend.py", line 2307, in gradients
    return tf.gradients(loss, variables, colocate_gradients_with_ops=True)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 540, in gradients
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 346, in _MaybeCompile
    return grad_fn()  # Exit early
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 540, in <lambda>
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\math_grad.py", line 825, in _MatMulGrad
    grad_b = math_ops.matmul(a, grad, transpose_a=True)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\math_ops.py", line 1816, in matmul
    a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 1217, in _mat_mul
    transpose_b=transpose_b, name=name)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\framework\ops.py", line 2506, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\framework\ops.py", line 1269, in __init__
    self._traceback = _extract_stack()
...which was originally created as op 'dense_1/MatMul', defined at:
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\ipython\start_kernel.py", line 231, in <module>
    main()
[elided 20 identical lines from previous traceback]
  File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 81, in <module>
    test.start(picture_data,key_values)
  File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 64, in start
    model = self.model_1(picture_data,key_values)
  File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 57, in model_1
    model.add(Dense(units = 8, activation ="softmax"))
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\models.py", line 476, in add
    output_tensor = layer(self.outputs[0])
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\engine\topology.py", line 596, in __call__
    output = self.call(inputs, **kwargs)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\layers\core.py", line 843, in call
    output = K.dot(inputs, self.kernel)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\backend\tensorflow_backend.py", line 976, in dot
    out = tf.matmul(x, y)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\math_ops.py", line 1816, in matmul
    a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 1217, in _mat_mul
    transpose_b=transpose_b, name=name)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\framework\ops.py", line 2506, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\framework\ops.py", line 1269, in __init__
    self._traceback = _extract_stack()
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[313344,8]
 [[Node: gradients/dense_1/MatMul_grad/MatMul_1 = MatMul[T=DT_FLOAT, _class=["loc:@dense_1/MatMul"], transpose_a=true, transpose_b=false, _device="/job:localhost/replica:0/task:0/gpu:0"](flatten_1/Reshape, gradients/dense_1/Softmax_grad/mul_1)]]

1 个答案:

答案 0 :(得分:0)

请重启 Python 后再试一次。除非在代码中显式指定,否则 GPU 显存不会被释放。有时在同一个 Python shell 中再次运行深度学习程序、又没有指定要使用多少显存,就会出现这种 OOM 错误。请参考这篇文章:

How to prevent tensorflow from allocating the totality of a GPU memory?