I am trying to increase the value of per_process_gpu_memory_fraction in my tf.GPUOptions() and then change the Keras session with set_session(), but the memory fraction never actually changes. After the first run of the while loop, 319MB of memory is reserved, as shown by nvidia-smi, which
a) is never released when clear_session() is called, and
b) does not go away on the next iteration of the while loop.
import GPUtil
import time

import tensorflow as tf
import numpy as np

from keras.backend.tensorflow_backend import set_session, clear_session, get_session
from tensorflow.python.framework.errors_impl import ResourceExhaustedError, UnknownError
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical


def model_trainer():
    y_pred = None
    errors = 0
    total_ram = GPUtil.getGPUs()[0].memoryTotal
    total_ram_allowed = GPUtil.getGPUs()[0].memoryTotal * 0.90
    mem_amount = 0.005  # intentionally allocated a small amount so it needs to
                        # increment the mem_amount

    x_train = np.empty((10000, 100))
    y_train = np.random.randint(0, 9, size=10000)
    y_train = to_categorical(y_train, 10)

    while y_pred is None:
        print("mem", mem_amount)
        if total_ram_allowed > total_ram * mem_amount and GPUtil.getGPUs()[0].memoryFree > total_ram * mem_amount:
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_amount)
            config = tf.ConfigProto(
                intra_op_parallelism_threads=2,
                inter_op_parallelism_threads=2,
                gpu_options=gpu_options)

            sess = tf.Session(config=config)
            set_session(sess)

            model = Sequential()
            model.add(Dense(units=64, activation='relu', input_dim=100))
            model.add(Dense(units=1024, activation='relu'))
            model.add(Dense(units=1024, activation='relu'))
            model.add(Dense(units=1024, activation='relu'))
            model.add(Dense(units=1024, activation='relu'))
            model.add(Dense(units=1024, activation='relu'))
            model.add(Dense(units=10, activation='softmax'))

            model.compile(loss='categorical_crossentropy',
                          optimizer='sgd',
                          metrics=['accuracy'])

            try:
                print(sess)
                model.fit(x_train, y_train, epochs=5, batch_size=32)
                y_pred = model.predict(x_train)
            except (ResourceExhaustedError, UnknownError) as e:
                if mem_amount > 1.0:
                    raise ValueError('model too large for vram')
                else:
                    mem_amount += 0.05

                clear_session()
                errors += 1
                pass
        else:
            clear_session()


if __name__ == "__main__":
    model_trainer()
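(As a side note, the sketch below is how the reserved VRAM can be watched between iterations of the loop; it assumes memoryUsed is the GPUtil field for used memory, alongside the memoryFree/memoryTotal fields already used above.)

# Sketch: log VRAM before/after each attempt to see whether clear_session()
# actually gives anything back.
import GPUtil

def log_vram(tag):
    gpu = GPUtil.getGPUs()[0]
    print("{}: {:.0f}MB used, {:.0f}MB free of {:.0f}MB".format(
        tag, gpu.memoryUsed, gpu.memoryFree, gpu.memoryTotal))

# e.g. log_vram("before fit") / log_vram("after clear_session()") around the
# try/except block above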
What is confusing is that Keras willingly takes the new session (as a get_session() call shows), but does not apply the new GPUOptions.
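(For what it is worth, the session Keras reports can also be inspected directly; the sketch below relies on the private _config attribute of tf.Session, which is a TF 1.x internal rather than a supported API, so treat it as an assumption.)

# Sketch (TF 1.x internals): check which config the active Keras session was
# built with. _config is a private attribute and may change between versions.
from keras.backend.tensorflow_backend import get_session

sess = get_session()
print(sess)  # is this the same object that was passed to set_session()?
if sess._config is not None:
    print(sess._config.gpu_options.per_process_gpu_memory_fraction)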
In addition to the example above, I have also tried:
clear_session()
del model

as well as:

clear_session()
del model
gc.collect()
None of these released the VRAM.
My overall goal is to use trial and error until the process has enough VRAM to train on, since there seems to be no good way of figuring out how much VRAM a Keras model needs without simply running it, so that I can run multiple models in parallel on a single GPU. When a ResourceExhaustedError occurs, I want to release the VRAM that is held by Keras and then try again with a larger amount of VRAM. Is there any way to accomplish this?
Answer 0: (score: 0)
After searching for a while, I found that Tensorflow will only ever take VRAM and will not release it until it dies, even if del model and clear_session() are used. I also tried the method shown here (https://github.com/keras-team/keras/issues/9379), which uses:
from keras import backend as K
K.clear_session()
from numba import cuda
cuda.select_device(0)
cuda.close()
This resulted in an error for me, because when Tensorflow tried to access the GPU again, its pointer to the memory space was invalid (since it had been killed with cuda.close()). So the only way around it is to use processes, not threads (I tried threads as well, with the same issue as before).
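The core of the workaround is this pattern, distilled from the full listing further down: each attempt runs in its own process, so the CUDA context dies with the process and its VRAM is returned. A minimal sketch (the real worker() is in the full code below):

# Minimal sketch of the process-per-attempt pattern: the child process owns the
# CUDA context, so all of its VRAM is released when it exits -- no
# clear_session() gymnastics needed in the parent.
import multiprocessing

def attempt(mem_amount, return_dict):
    # build the session/model and train here, as worker() does in the full code
    return_dict["valid"] = True  # only set if training did not run out of VRAM

if __name__ == "__main__":
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    p = multiprocessing.Process(target=attempt, args=(0.05, return_dict))
    p.start()
    p.join()  # once join() returns, the child's VRAM has been freed
    print("valid" in return_dict)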
The other thing I found is that, while there are methods to estimate the amount of VRAM a Keras model will use, none of them are very accurate. (See: How to determine needed memory of Keras model?) I also tried computing it directly from the Keras layers, and the numbers varied wildly, so that was not accurate either. So it really only leaves you with trial and error: catch the ResourceExhaustedError and retry, as sketched below.
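(To make "computing it directly from the Keras layers" concrete, the rough estimate below counts only the weights, which is exactly why this kind of calculation disagrees so badly with nvidia-smi: activations, gradients, optimizer state and the cuDNN workspace are not included.)

# Sketch: a parameters-only VRAM estimate for a built Keras model. At best a
# lower bound -- activations, gradients, optimizer slots and framework overhead
# are ignored, which is why it is so far off in practice.
def naive_vram_estimate_mb(model, bytes_per_param=4):
    return model.count_params() * bytes_per_param / (1024.0 ** 2)

# e.g. print(naive_vram_estimate_mb(model)) after building the Sequential model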
Below is my code for running multiple different Keras models on a single GPU.
import GPUtil
import time
import multiprocessing

import tensorflow as tf
import numpy as np

from keras.backend.tensorflow_backend import set_session, clear_session, get_session
from tensorflow.python.framework.errors_impl import ResourceExhaustedError, UnknownError
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical


def model_trainer():
    mem_amount = 0.05

    x_train = np.empty((100000, 100))
    y_train = np.random.randint(0, 9, size=100000)
    y_train = to_categorical(y_train, 10)

    manager = multiprocessing.Manager()
    return_dict = manager.dict()

    def worker(mem_amount, return_dict):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_amount)
        config = tf.ConfigProto(
            intra_op_parallelism_threads=2,
            inter_op_parallelism_threads=2,
            gpu_options=gpu_options)

        sess = tf.Session(config=config)
        set_session(sess)

        model = Sequential()
        model.add(Dense(units=64, activation='relu', input_dim=100))
        model.add(Dense(units=1024, activation='relu'))
        model.add(Dense(units=1024, activation='relu'))
        model.add(Dense(units=2048, activation='relu'))
        model.add(Dense(units=10, activation='softmax'))

        model.compile(loss='categorical_crossentropy',
                      optimizer='sgd',
                      metrics=['accuracy'])

        try:
            get_session()
            model.fit(x_train, y_train, epochs=5, batch_size=1000)
            return_dict["valid"] = True
        except (ResourceExhaustedError, UnknownError) as e:
            return

    while "valid" not in list(return_dict.keys()):
        print("mem", mem_amount)
        total_ram = GPUtil.getGPUs()[0].memoryTotal
        total_ram_allowed = GPUtil.getGPUs()[0].memoryTotal * 0.90

        # can add in a for loop to have multiple models
        if total_ram_allowed > total_ram * mem_amount and GPUtil.getGPUs()[0].memoryFree > total_ram * mem_amount:
            p = multiprocessing.Process(target=worker, args=(mem_amount, return_dict))
            p.start()
            p.join()

            print(return_dict.values())

            if "valid" not in list(return_dict.keys()):
                if mem_amount > 1.0:
                    raise ValueError('model too large for vram')
                else:
                    mem_amount += 0.05
            else:
                break
        else:
            time.sleep(10)


if __name__ == "__main__":
    model_trainer()