I'm trying to train a DNN in Google Colab, since I saw that they offer a "Tesla K80", which should make my training faster because my laptop doesn't have a very good GPU.
I ran my code and the runtime crashed. There was no error output, just a notification from Colab saying:
"Your session crashed after using all available RAM."
I searched around and found this:
Google Colaboratory: misleading information about its GPU (only 5% RAM available to some users)
I checked the available GPU memory using:
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
gpu = GPUs[0]
def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " | Proc size: " + humanize.naturalsize(process.memory_info().rss))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()
and got this output:
Gen RAM Free: 11.6 GB | Proc size: 150.0 MB
GPU RAM Free: 11439MB | Used: 0MB | Util 0% | Total 11439MB
which means I have plenty of GPU memory for training my model.
I tried restarting and resetting the runtime, and I also restarted the browser. Nothing helped; it keeps crashing.
Note: my training data file is 1.4 GB, so I thought loading it might be what exhausts the memory. So I split the code, loaded just the training data, and then ran the "check GPU RAM" code again; its output was:
Gen RAM Free: 11.6 GB | Proc size: 1.50 GB
GPU RAM Free: 11439MB | Used: 0MB | Util 0% | Total 11439MB
Still plenty of RAM free, yet the runtime crashed again.
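I'm not sure this is related, but here is the back-of-the-envelope arithmetic I did for how big the arrays could get in memory (the sample count below is a made-up placeholder just to illustrate the calculation, and I'm assuming the frames are stored as uint8 at WIDTH x HEIGHT x CHANNELS):
n_samples = 70000                         # hypothetical count, for illustration only
frame_bytes = 160 * 120 * 1               # WIDTH * HEIGHT * CHANNELS, one byte per pixel
uint8_gb = n_samples * frame_bytes / 1e9  # ~1.3 GB as uint8, close to my file size
float64_gb = uint8_gb * 8                 # dividing by 255.0 produces float64 copies
print(f"uint8: {uint8_gb:.2f} GB | float64 after /255.0: {float64_gb:.2f} GB")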
How can I get my model to train?
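One idea I considered but haven't verified (mmap_mode only applies when the .npy holds a plain array saved with np.save, not a pickled object array) is memory-mapping the file so it isn't read into RAM all at once:
import numpy as np
# Lazily map the file instead of loading all 1.4 GB into RAM up front;
# this does not work for object arrays that need allow_pickle=True
training_data = np.load("/drive/My Drive/Colab Notebooks/GTAV/training_data-v0.3.npy",
                        mmap_mode="r")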
(My full code is below.)
import os
from google.colab import drive
drive.mount("/drive")
os.chdir("/drive/My Drive/Colab Notebooks/GTAV/model")
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: there is only one GPU on Colab, and even that isn't guaranteed
gpu = GPUs[0]
def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " | Proc size: " + humanize.naturalsize(process.memory_info().rss))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()
import tensorflow as tf
import tflearn
from tflearn.layers.core import fully_connected, dropout, input_data
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.estimator import regression
from tflearn.layers.normalization import local_response_normalization
###################################################
RUN_ID = "model_alexnetv2-v0.9"
EPOCHS = 10
VS = 2000 #Validation Samples
WIDTH = 160
HEIGHT = 120
CHANNELS = 1
LR = 1e-3
###################################################
def alexnetv2(output=3):
    network = input_data(shape=[None, WIDTH, HEIGHT, CHANNELS], name='input')
    network = conv_2d(network, 96, 11, strides=4, activation='relu')
    network = max_pool_2d(network, 3, strides=2)
    network = local_response_normalization(network)
    network = conv_2d(network, 256, 5, activation='relu')
    network = max_pool_2d(network, 3, strides=2)
    network = local_response_normalization(network)
    network = conv_2d(network, 384, 3, activation='relu')
    network = conv_2d(network, 384, 3, activation='relu')
    network = conv_2d(network, 256, 3, activation='relu')
    network = max_pool_2d(network, 3, strides=2)
    network = conv_2d(network, 256, 5, activation='relu')
    network = max_pool_2d(network, 3, strides=2)
    network = local_response_normalization(network)
    network = conv_2d(network, 384, 3, activation='relu')
    network = conv_2d(network, 384, 3, activation='relu')
    network = conv_2d(network, 256, 3, activation='relu')
    network = max_pool_2d(network, 3, strides=2)
    network = local_response_normalization(network)
    network = fully_connected(network, 4096, activation='tanh')
    network = dropout(network, 0.5)
    network = fully_connected(network, 4096, activation='tanh')
    network = dropout(network, 0.5)
    network = fully_connected(network, 4096, activation='tanh')
    network = dropout(network, 0.5)
    network = fully_connected(network, 4096, activation='tanh')
    network = dropout(network, 0.5)
    network = fully_connected(network, output, activation='softmax')
    model = regression(network, optimizer='momentum',
                       loss='categorical_crossentropy',
                       learning_rate=LR, name='targets')
    model = tflearn.DNN(model, checkpoint_path="model-alexnetv2", max_checkpoints=1,
                        tensorboard_verbose=2, tensorboard_dir="model_training_log")
    return model
def train_data(training_data, model=False):
    X = np.array([x[0] for x in training_data]).reshape(-1, WIDTH, HEIGHT, CHANNELS)
    y = [x[1] for x in training_data]
    print(" >> Samples and Labels created!")
    # NOTE: despite the names, train_X/train_y hold the validation samples
    train_X = X[-VS:]
    train_y = y[-VS:]
    print(" >> Validation Set created!")
    X = X[:-VS]
    y = y[:-VS]
    print(" >> Training Set created!")
    # scale pixel values to [0, 1]
    train_X = train_X/255.0
    X = X/255.0
    if not model:
        model = alexnetv2()
    model.fit(X, y, n_epoch=EPOCHS, show_metric=True, snapshot_step=500,
              validation_set=(train_X, train_y), run_id=RUN_ID)
    return model
import numpy as np
import time
start = time.time()
print("Loading data...//")
training_data = np.load("/drive/My Drive/Colab Notebooks/GTAV/training_data-v0.3.npy", allow_pickle=True)
print(f"{len(training_data)} training samples loaded in {np.round(time.time()-start, 2)} seconds.")
print("-------------------------------")
print("Training model...//")
model = train_data(training_data)
os.mkdir(f"/drive/My Drive/Colab Notebooks/GTAV/model/{RUN_ID}")
os.chdir(f"/drive/My Drive/Colab Notebooks/GTAV/model/{RUN_ID}")
print("Saving model...//")
model.save(RUN_ID)