I am trying to train a deep learning model in an Azure Notebook that uses the GPU of a DSVM (Ubuntu 18.04) on a Standard NC6 (6 vCPUs, 56 GiB memory), and I get the following error:
RuntimeError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 11.17 GiB total capacity; 10.76 GiB already allocated; 50.31 MiB free; 10.84 GiB reserved in total by PyTorch)
I have searched around for this, but could not find a solution in any of the questions on the web. The phrase "10.84 GiB reserved in total by PyTorch" in the error message caught my attention. Can this be configured to a lower value? I would appreciate any input on this. Thank you.
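As a side note, my current understanding of the numbers in that message comes from the torch.cuda memory API; below is a minimal, illustrative sketch of the calls I mean (assuming a reasonably recent PyTorch; this is not part of my training script):

import torch

device = torch.device("cuda:0")

# Memory currently held by live tensors ("already allocated" in the error)
print(torch.cuda.memory_allocated(device))

# Memory the caching allocator has claimed from the GPU
# ("reserved in total by PyTorch" in the error)
print(torch.cuda.memory_reserved(device))

# Releases unused cached blocks back to the driver; it does not free live tensors
torch.cuda.empty_cache()

# Detailed breakdown of the allocator state
print(torch.cuda.memory_summary(device))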
Here is my fine-tuning/training code:
for epoch in range(EPOCHS):
    for idx, article in tqdm_notebook(enumerate(article_loader)):
        # Tokenize one article and move it to the GPU
        article_tens = torch.tensor(
            tokenizer.encode(article[0], max_length=1024)
        ).unsqueeze(0).to(device)

        # Forward pass; passing labels makes the model return the LM loss as well
        outputs = model(article_tens, labels=article_tens)
        train_loss, prediction_scores = outputs[:2]
        train_loss.backward()
        train_sum_loss = train_sum_loss + train_loss.detach().data

        iteration_count = idx
        article_count = article_count + 1

        # Gradient accumulation: step the optimizer only every BATCH_SIZE articles
        if article_count == BATCH_SIZE:
            article_count = 0
            batch_count += 1
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            model.zero_grad()
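In case it is relevant, this is a rough sketch of how I could log the peak GPU memory per iteration to see where usage grows (again assuming the standard torch.cuda API; log_peak_memory is just a hypothetical helper name, not something from my script):

import torch

def log_peak_memory(device, tag=""):
    # Hypothetical helper: report the allocator's peak usage, then reset the peak counter
    peak_mib = torch.cuda.max_memory_allocated(device) / 1024 ** 2
    print(f"{tag} peak allocated: {peak_mib:.1f} MiB")
    torch.cuda.reset_peak_memory_stats(device)

I would call log_peak_memory(device, f"article {idx}") at the end of each inner-loop iteration.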
The full stack trace of the error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-11-2c74a22e42f7> in <module>
20 article_tens = torch.tensor(tokenizer.encode(article[0], max_length=1024)).unsqueeze(0).to(device)
21
---> 22 outputs = model(article_tens, labels=article_tens)
23
24 train_loss, prediction_scores = outputs[:2]
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/transformers/modeling_gpt2.py in forward(self, input_ids, past, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, use_cache)
602 head_mask=head_mask,
603 inputs_embeds=inputs_embeds,
--> 604 use_cache=use_cache,
605 )
606 hidden_states = transformer_outputs[0]
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/transformers/modeling_gpt2.py in forward(self, input_ids, past, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, use_cache)
486 attention_mask=attention_mask,
487 head_mask=head_mask[i],
--> 488 use_cache=use_cache,
489 )
490
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/transformers/modeling_gpt2.py in forward(self, x, layer_past, attention_mask, head_mask, use_cache)
240
241 x = x + a
--> 242 m = self.mlp(self.ln_2(x))
243 x = x + m
244
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/transformers/modeling_gpt2.py in forward(self, x)
215
216 def forward(self, x):
--> 217 h = self.act(self.c_fc(x))
218 h2 = self.c_proj(h)
219 return self.dropout(h2)
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/transformers/activations.py in gelu_new(x)
27 Also see https://arxiv.org/abs/1606.08415
28 """
---> 29 return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
30
31
RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 11.17 GiB total capacity; 10.74 GiB already allocated; 320.00 KiB free; 10.89 GiB reserved in total by PyTorch)