Runtime crashes in Google Colab even though RAM is still free

Date: 2019-07-15 18:54:48

Tags: python tensorflow machine-learning deep-learning google-colaboratory

I am trying to run a DNN in Google Colab because I saw that they provide a "Tesla K80", which should make my training faster, since my laptop does not have a very good GPU.

I ran my code and the runtime crashed. There was no error output, only a notification from Colab saying:

"Your session crashed after using all available RAM."

I searched around and found this:

Google Colaboratory: misleading information about its GPU (only 5% RAM available to some users)

I checked the available GPU memory with:
# Symlink nvidia-smi where GPUtil expects it, then install the helpers.
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize

import os

import GPUtil as GPU
import humanize
import psutil

GPUs = GPU.getGPUs()
gpu = GPUs[0]  # Colab exposes at most one GPU

def printm():
    # Report free system RAM, this process's resident size, and GPU memory.
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available),
          " | Proc size: " + humanize.naturalsize(process.memory_info().rss))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(
        gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil * 100, gpu.memoryTotal))

printm()

I got this output:

Gen RAM Free: 11.6 GB  | Proc size: 150.0 MB
GPU RAM Free: 11439MB | Used: 0MB | Util  0% | Total 11439MB

This means I have plenty of free GPU memory to train my model.

I tried restarting and resetting the runtime, and I also restarted the browser. Nothing helped; it keeps crashing.

Note: my training data file is 1.4 GB, so I thought loading it might be what exhausts the memory. So I split the code, loaded the training data first, and then ran the "check GPU RAM" code again, which output:

Gen RAM Free: 11.6 GB  | Proc size: 1.50 GB
GPU RAM Free: 11439MB | Used: 0MB | Util  0% | Total 11439MB

There is still plenty of RAM free, yet the runtime crashed again.
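
For reference, a minimal sketch of how the loaded data's real in-memory footprint could be checked (the path and the [frame, label] pair layout are taken from the full code below; the uint8 frame dtype is an assumption):

import numpy as np

# Object arrays saved with allow_pickle=True only count pointers in .nbytes,
# so stack the frames first to measure what they actually occupy.
data = np.load("/drive/My Drive/Colab Notebooks/GTAV/training_data-v0.3.npy",
               allow_pickle=True)
frames = np.array([x[0] for x in data])

print(f"frames dtype: {frames.dtype}, in RAM: {frames.nbytes / 1e9:.2f} GB")
# Dividing a uint8 array by 255.0 yields a float64 copy, 8 bytes per pixel:
print(f"after / 255.0: {frames.size * 8 / 1e9:.2f} GB")

If the frames are uint8, a 1.4 GB file can balloon to roughly eight times that size once normalized, which would outgrow the ~12 GB of system RAM shown above even while GPU memory stays untouched.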

How can I get my model to train?

Below is my full code.

import os
from google.colab import drive

drive.mount("/drive")
os.chdir("/drive/My Drive/Colab Notebooks/GTAV/model")

Check available GPU memory

!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU

GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and it isn't guaranteed
gpu = GPUs[0]

def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available),
          " | Proc size: " + humanize.naturalsize(process.memory_info().rss))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(
        gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil * 100, gpu.memoryTotal))


printm()

My model and training function

import numpy as np  # train_data() below needs this
import tensorflow as tf
import tflearn
from tflearn.layers.core import fully_connected, dropout, input_data
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.estimator import regression
from tflearn.layers.normalization import local_response_normalization

###################################################
RUN_ID     = "model_alexnetv2-v0.9"
EPOCHS     = 10
VS         = 2000 #Validation Samples
WIDTH      = 160
HEIGHT     = 120
CHANNELS   = 1
LR         = 1e-3
###################################################

def alexnetv2(output=3):
    # AlexNet-style CNN: repeated conv/pool/LRN blocks, then 4096-unit
    # fully connected layers with dropout and a softmax output.
    network = input_data(shape=[None, WIDTH, HEIGHT, CHANNELS], name='input')
    network = conv_2d(network, 96, 11, strides=4, activation='relu')
    network = max_pool_2d(network, 3, strides=2)
    network = local_response_normalization(network)
    network = conv_2d(network, 256, 5, activation='relu')
    network = max_pool_2d(network, 3, strides=2)
    network = local_response_normalization(network)
    network = conv_2d(network, 384, 3, activation='relu')
    network = conv_2d(network, 384, 3, activation='relu')
    network = conv_2d(network, 256, 3, activation='relu')
    network = max_pool_2d(network, 3, strides=2)
    network = conv_2d(network, 256, 5, activation='relu')
    network = max_pool_2d(network, 3, strides=2)
    network = local_response_normalization(network)
    network = conv_2d(network, 384, 3, activation='relu')
    network = conv_2d(network, 384, 3, activation='relu')
    network = conv_2d(network, 256, 3, activation='relu')
    network = max_pool_2d(network, 3, strides=2)
    network = local_response_normalization(network)
    network = fully_connected(network, 4096, activation='tanh')
    network = dropout(network, 0.5)
    network = fully_connected(network, 4096, activation='tanh')
    network = dropout(network, 0.5)
    network = fully_connected(network, 4096, activation='tanh')
    network = dropout(network, 0.5)
    network = fully_connected(network, 4096, activation='tanh')
    network = dropout(network, 0.5)
    network = fully_connected(network, output, activation='softmax')
    model = regression(network, optimizer='momentum',
                         loss='categorical_crossentropy',
                         learning_rate=LR, name='targets')

    model = tflearn.DNN(model, checkpoint_path="model-alexnetv2", max_checkpoints=1,
                        tensorboard_verbose=2, tensorboard_dir="model_training_log")

    return model


def train_data(training_data, model=False):
    X = np.array([x[0] for x in training_data]).reshape(-1, WIDTH, HEIGHT, CHANNELS)
    y = [x[1] for x in training_data]
    print("   >> Samples and labels created!")

    # NOTE: despite the names, train_X/train_y hold the last VS samples and
    # are passed to model.fit() below as the *validation* set.
    train_X = X[-VS:]
    train_y = y[-VS:]
    print("   >> Validation set created!")

    X = X[:-VS]
    y = y[:-VS]
    print("   >> Training set created!")

    # "/ 255.0" returns full-size float64 copies (assuming uint8 frames), so
    # the normalized data briefly coexists with the original arrays in RAM.
    train_X = train_X / 255.0
    X = X / 255.0

    if not model:
        model = alexnetv2()


    model.fit(X, y, n_epoch=EPOCHS, show_metric=True, snapshot_step=500,
              validation_set=(train_X, train_y), run_id=RUN_ID)

    return model
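
As a hedged aside, the same split-and-normalize step can be sketched with an in-place float32 conversion, which halves the size of the float64 copies above (split_and_normalize is a hypothetical name; VS, WIDTH, HEIGHT, CHANNELS are the constants defined earlier):

def split_and_normalize(training_data, vs=VS):
    # Stack the frames directly as float32 rather than letting "/ 255.0"
    # promote uint8 to float64: 4 bytes per pixel instead of 8.
    X = np.array([x[0] for x in training_data],
                 dtype=np.float32).reshape(-1, WIDTH, HEIGHT, CHANNELS)
    y = [x[1] for x in training_data]
    X /= 255.0  # in-place, so no second full-size copy is allocated
    # NumPy slices are views, so the split itself costs no extra array memory.
    return X[:-vs], y[:-vs], X[-vs:], y[-vs:]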

Load the training data

import numpy as np
import time

start = time.time()
print("Loading data...//")
training_data = np.load("/drive/My Drive/Colab Notebooks/GTAV/training_data-v0.3.npy", allow_pickle=True)
print(f"{len(training_data)} training samples loaded in {np.round(time.time()-start, 2)} seconds.")

Train the model

print("-------------------------------")
print("Training model...//")

model = train_data(training_data)

# Create the run directory and switch into it before saving.
os.mkdir(f"/drive/My Drive/Colab Notebooks/GTAV/model/{RUN_ID}")
os.chdir(f"/drive/My Drive/Colab Notebooks/GTAV/model/{RUN_ID}")

print("Saving model...//")
model.save(RUN_ID)

0 Answers