During training, GPU usage stays flat for some of the data (for example, the first 25 samples), but on one particular sample the GPU memory usage suddenly explodes.
System information
When I train the network, memory and GPU usage stay unchanged for the first 25 training samples:
But when it gets to the 46th sample, I get the following, and GPU and memory usage go up:
And when the 96th sample arrives, I get:
Limit: 7012814029
InUse: 4755516416
MaxInUse: 5347089664
NumAllocs: 624595
MaxAllocSize: 2055250688
2020-06-12 22:52:55.958865: W tensorflow/core/common_runtime/bfc_allocator.cc:271] ****************__********************************************************************___***********
2020-06-12 22:52:55.959733: W tensorflow/core/framework/op_kernel.cc:1401] OP_REQUIRES failed at aggregate_ops.cc:70 : Resource exhausted: OOM when allocating tensor with shape[129675,512] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
Traceback (most recent call last):
File "/home/frank/anaconda3/envs/smpl/lib/python3.7/site-packages/tensorflow/python/eager/backprop.py", line 553, in _aggregate_grads
return gen_math_ops.add_n(gradients)
File "/home/frank/anaconda3/envs/smpl/lib/python3.7/site-packages/tensorflow/python/ops/gen_math_ops.py", line 428, in add_n
_six.raise_from(_core._status_to_exception(e.code, message), None)
File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[129675,512] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:AddN]
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/frank/PycharmProjects/reconstruction_NN/reconstruction_test.py", line 634, in <module>
train_model()
File "/home/frank/PycharmProjects/reconstruction_NN/reconstruction_test.py", line 538, in train_model
lo = m.train(train_dat, gt_dat, loss_dict)
File "/home/frank/PycharmProjects/reconstruction_NN/reconstruction_NN.py", line 498, in train
grad = gtape.gradient(loss_, self.trainable_variables)
File "/home/frank/anaconda3/envs/smpl/lib/python3.7/site-packages/tensorflow/python/eager/backprop.py", line 946, in gradient
unconnected_gradients=unconnected_gradients)
File "/home/frank/anaconda3/envs/smpl/lib/python3.7/site-packages/tensorflow/python/eager/imperative_grad.py", line 72, in imperative_grad
compat.as_str(unconnected_gradients.value))
File "/home/frank/anaconda3/envs/smpl/lib/python3.7/site-packages/tensorflow/python/eager/backprop.py", line 126, in _gradient_function
mock_op = _MockOp(attr_tuple, inputs, outputs, op_name)
SystemError: <class 'tensorflow.python.eager.backprop._MockOp'> returned a result with an error set
Process finished with exit code 1
Something seems to be going wrong here.
(It is almost impossible to show my whole large project here.) Below is how I load the data and how I train the NN (the training pairs are loaded one by one):
for ep in range(50):
    start = time.time()
    loss = None
    for i in range(begin_index, begin_index + feeding_size):  # range(1): # range(len(gen)):
        st = time.time()
        # if(i == 46):
        #     continue
        print('processing: epoch', ep, ' ', i)
        train_dat = pkl.load(open('/media/frank/Elements1/data/Train_data/Training_pairs/shuffled_input_data/shuffle_input_dat270_510_%d.pkl' % i, "rb"),
                             encoding="latin1")  # added this encoding stuff for Python3
        gt_dat = pkl.load(open('/media/frank/Elements1/data/Train_data/Training_pairs/shuffled_gt_data/shuffle_gt_dat270_510_%d.pkl' % i, "rb"),
                          encoding="latin1")  # added this encoding stuff for Python3
        print("here")
        for i in range(0, 10):
            gt_dat['trans_' + str(i)] = np.zeros((1, 3))
        lo = m.train(train_dat, gt_dat, loss_dict)
        if loss is None:
            loss = lo
        else:
            for k in lo:
                loss[k] += lo[k]
        print(loss)
        print(time.time() - st)
    for k in loss:
        loss[k] /= float(feeding_size)
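In case it is a data problem, this is the kind of shape check I could add right after the two pkl.load calls, to see whether one sample is much larger than the others (just a sketch; report_shapes is a hypothetical helper and not part of my project):

import numpy as np

def report_shapes(sample_dict, tag):
    # Hypothetical helper: print every numpy array in a loaded sample and the
    # total number of elements, so an oversized sample stands out immediately.
    total = 0
    for key, val in sorted(sample_dict.items()):
        if isinstance(val, np.ndarray):
            total += val.size
            print(tag, key, val.shape, val.dtype)
    print(tag, 'total elements:', total)

# usage inside the loop above, right after loading:
# report_shapes(train_dat, 'train_%d' % i)
# report_shapes(gt_dat, 'gt_%d' % i)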
(The train method:)
def train(self, inp_dict, gt_dict, loss_dict, specific_opt=None):
    # tf.print("size of NUM: ", NUM)
    images = [inp_dict['image_{}'.format(i)].astype('float32') for i in range(NUM)]
    J_2d = [inp_dict['J_2d_{}'.format(i)].astype('float32') for i in range(NUM)]
    with tf.GradientTape() as gtape:
        out_dict = self.call([images, J_2d])
        loss = self.loss_model(gt_dict, out_dict, wt_dict=loss_dict)
        loss_ = 0
        for k in loss:
            loss_ += loss[k]  # get the sum of all loss.
    grad = gtape.gradient(loss_, self.trainable_variables)
    if specific_opt is not None:  # optimize specific loss
        opt_var, opt_grad = [], []
        print("we are here!")
        for x, g in zip(self.trainable_variables, grad):
            if x.name in specific_opt:
                opt_var.append(x)
                opt_grad.append(g)
        self.optimizer.apply_gradients(
            zip(opt_grad, opt_var))
    else:
        print("we are here!")
        self.optimizer.apply_gradients(
            zip(grad, self.trainable_variables))
    for k in loss:
        # print(type(loss[k]))
        loss[k] = loss[k].numpy()  # get the sum of all loss.
        # print(type(loss[k]))
    # print(type(loss))
    # input("ok")
    return loss
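The tensor that fails to allocate has shape [129675, 512], so as a sanity check I could also dump the shapes of self.trainable_variables after building the model, to see whether any variable (and therefore its gradient) is expected to be that large. A quick sketch (count_params is a hypothetical helper, not something from my code; m is the model instance used in the training loop above):

import numpy as np

def count_params(model):
    # Hypothetical helper: print each trainable variable with its shape and
    # return the total number of trainable parameters.
    total = 0
    for v in model.trainable_variables:
        n = int(np.prod(v.shape.as_list()))
        total += n
        print(v.name, v.shape, n)
    return total

# print('trainable parameters:', count_params(m))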
Can anyone help?