使用TensorFlow和Keras训练后内核重启失败

时间:2018-09-12 16:42:06

标签: python tensorflow keras cudnn

我是TensorFlow的新手。我刚刚开始使用以TensorFlow为后端的Keras。我正在使用一台配备i7处理器、16 GB内存和GeForce GT 750M显卡的笔记本电脑。TensorFlow版本为1.10,CUDA Toolkit 9.1,cuDNN 7.14

我使用以下代码对预训练的VGG16模型进行训练。我的训练集有300张图像。我用批处理大小8测试了这段代码,它可以正常工作。但是,当我将批处理大小改为16时,Spyder中的内核在训练后无法重启,并出现以下错误:

tensorflow/stream_executor/cuda/cuda_driver.cc:1078] failed to synchronize the stop event: CUDA_ERROR_LAUNCH_FAILED
tensorflow/stream_executor/cuda/cuda_timer.cc:55] Internal: error destroying CUDA event in context 0000023F82C1E010: CUDA_ERROR_LAUNCH_FAILED
tensorflow/stream_executor/cuda/cuda_timer.cc:60] Internal: error destroying CUDA event in context 0000023F82C1E010: CUDA_ERROR_LAUNCH_FAILED
tensorflow/stream_executor/cuda/cuda_dnn.cc:211] Check failed: status == CUDNN_STATUS_SUCCESS (7 vs. 0)Failed to set cuDNN stream.

我使用了以下代码:

import argparse
import glob
import os
import sys

import matplotlib.pyplot as plt
import tensorflow as tf
from keras import __version__
from keras import backend as K
from keras.applications import vgg16
from keras.applications.vgg16 import preprocess_input
from keras.layers import Dense, GlobalAveragePooling2D
from keras.models import Model
from keras.optimizers import SGD
from keras.preprocessing.image import ImageDataGenerator

# Size every image is resized to by the data generators.
IM_WIDTH = 299
IM_HEIGHT = 299
# Number of training epochs passed to train() by the script entry point.
NB_EPOCHS = 2
# Mini-batch size passed to train() (author's alternative value: 32).
BAT_SIZE = 16
# Number of units in the fully connected layer placed before the softmax.
FC_SIZE = 256



def get_nb_files(directory):
  """Count the regular files stored in the sub-directories of ``directory``.

  The tree is walked recursively. Files lying directly in ``directory``
  itself are not counted, only those inside its sub-directories (at any
  depth). Hidden files (names starting with a dot) are skipped because the
  ``*`` glob pattern does not match them.

  Args:
    directory: path of the dataset root.
  Returns:
    Number of files found, or 0 when ``directory`` does not exist.
  """
  if not os.path.exists(directory):
    return 0
  cnt = 0
  for root, dirs, _files in os.walk(directory):
    for sub in dirs:
      entries = glob.glob(os.path.join(root, sub, "*"))
      # The glob also matches nested directories; those are not samples,
      # so only regular files contribute to the count.
      cnt += sum(1 for entry in entries if os.path.isfile(entry))
  return cnt
#********************************************************
def add_new_last_layer(base_model, nb_classes):
  """Attach a new classification head to a headless convnet.

  The head is: global average pooling -> Dense(FC_SIZE, relu) ->
  Dense(nb_classes, softmax).

  Args:
    base_model: keras model excluding top
    nb_classes: # of classes
  Returns:
    new keras model mapping base_model's input to the softmax predictions
  """
  x = base_model.output
  # Collapse the spatial dimensions so the head does not depend on image size.
  x = GlobalAveragePooling2D()(x)
  x = Dense(FC_SIZE, activation='relu')(x)
  predictions = Dense(nb_classes, activation='softmax')(x)  # new softmax layer

  # Keras 2 names these arguments `inputs`/`outputs`; the singular Keras 1
  # spellings used previously are deprecated.
  model = Model(inputs=base_model.input, outputs=predictions)
  return model
#*************************************************
def train(trndir,valdir,nbepoch,batchsize,plot):
  """Train a new classification head on top of a frozen, pretrained VGG16.

  Args:
    trndir: training directory; its immediate sub-directories are counted
      as the classes.
    valdir: validation directory, read the same way as ``trndir``.
    nbepoch: number of epochs (converted with ``int``).
    batchsize: mini-batch size (converted with ``int``).
    plot: currently unused; kept so existing callers keep working.

  Side effects:
    Saves the trained model to "vgg-ft.model" in the working directory.
  """
  nb_train_samples = get_nb_files(trndir)
  nb_classes = len(glob.glob(trndir + "/*"))
  nb_val_samples = get_nb_files(valdir)
  nb_epoch = int(nbepoch)
  batch_size = int(batchsize)

  # Keras 2 measures an epoch in batches, not samples. Passing the sample
  # count (the Keras 1 convention) makes every epoch batch_size times too
  # long, so convert with a ceiling division (at least one step).
  steps_per_epoch = max(1, (nb_train_samples + batch_size - 1) // batch_size)
  validation_steps = max(1, (nb_val_samples + batch_size - 1) // batch_size)

  # data prep
  train_datagen = ImageDataGenerator(
      preprocessing_function=preprocess_input,
      rotation_range=30,
      width_shift_range=0.2,
      height_shift_range=0.2,
      shear_range=0.2,
      zoom_range=0.2,
      horizontal_flip=True
  )
  # NOTE(review): the validation images receive the same random augmentation
  # as the training images - confirm this is intended.
  test_datagen = ImageDataGenerator(
      preprocessing_function=preprocess_input,
      rotation_range=30,
      width_shift_range=0.2,
      height_shift_range=0.2,
      shear_range=0.2,
      zoom_range=0.2,
      horizontal_flip=True
  )

  # target_size is (height, width) in Keras.
  train_generator = train_datagen.flow_from_directory(
    trndir,
    target_size=(IM_HEIGHT, IM_WIDTH),
    batch_size=batch_size,
  )

  validation_generator = test_datagen.flow_from_directory(
    valdir,
    target_size=(IM_HEIGHT, IM_WIDTH),
    batch_size=batch_size,
  )

  # setup model
  base_model = vgg16.VGG16(weights='imagenet', include_top=False)

  model = add_new_last_layer(base_model, nb_classes)
  print ("setup_to_transfer_learn")

  setup_to_transfer_learn(model, base_model)
  print (" model.fit_generator")
  # NOTE(review): 'auto' is not a documented class_weight value in Keras 2
  # (a dict is expected) - confirm it has any effect.
  model.fit_generator(
    train_generator,
    epochs=nb_epoch,
    steps_per_epoch=steps_per_epoch,
    validation_data=validation_generator,
    validation_steps=validation_steps,
    class_weight='auto')
  model.save("vgg-ft.model")



def setup_to_transfer_learn(model, base_model):
  """Mark every layer of ``base_model`` as non-trainable, then compile ``model``.

  Args:
    model: the full model to compile.
    base_model: the model whose layers are frozen.
  """
  pretrained_layers = base_model.layers
  for frozen in pretrained_layers:
    frozen.trainable = False

  # Compile after freezing: RMSprop optimizer, categorical cross-entropy
  # loss, accuracy reported as metric.
  compile_options = dict(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
  )
  model.compile(**compile_options)

if __name__=="__main__":
    # Dataset root (presumably one sub-folder per class - verify locally).
    Imgdir="D:/NHQ/coursea_computervision/TensorFlowGPU/training_set - Copy"
    # The same folder is used for both training and validation.
    trndir = valdir = Imgdir

    # Reset the Keras backend session before building a new model.
    K.clear_session()
    # Request placement of the work below on the first GPU.
    with tf.device('/gpu:0'):
        cnt = get_nb_files(Imgdir)
        print("cnt:", cnt)
        train(trndir, valdir, NB_EPOCHS, BAT_SIZE, plot=True)

1 个答案:

答案 0 :(得分:0)

如果使用GPU,请尝试将以下代码放在程序的开头。不幸的是,这未必能解决问题,因为此类错误通常是由内存不足引起的。

from keras import backend as K

# Session configuration that lets TensorFlow grow its GPU memory allocation
# on demand instead of reserving it all up front.
session_config = K.tf.ConfigProto()
session_config.gpu_options.allow_growth = True

# Make Keras use a session created with that configuration.
growth_session = K.tf.Session(config=session_config)
K.set_session(growth_session)