在训练过程中,进度达到约 3% 时出现了 Out of Memory 错误。
但关于这个错误我有一个疑问:一开始训练很顺利,后来才出现 Out of Memory,说明显存占用是随着训练的进行逐渐累积的。到底是什么在不断累积?
def init_process_group(rank, world_size):
    """Join the distributed process group via a localhost rendezvous.

    Every spawned worker calls this with its own rank; all workers must
    agree on MASTER_ADDR/MASTER_PORT for the rendezvous to succeed.
    """
    # Rendezvous endpoint shared by all workers (hard-coded port 12355).
    os.environ.update({'MASTER_ADDR': 'localhost', 'MASTER_PORT': '12355'})
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
def build_data_loader(vocab, infile, args, shuffle=True):
    """Create a DataLoader over a MovieDataset.

    In multi-GPU training (n_gpu > 1) with shuffling, a DistributedSampler
    partitions the dataset across ranks and is returned alongside the loader
    so the caller can call sampler.set_epoch(); otherwise sampler is None.
    """
    dataset = MovieDataset(vocab, infile)

    if args.n_gpu > 1 and shuffle:
        sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=config.tconfig.batch_size,
            sampler=sampler,
            collate_fn=movie_collate_fn,
        )
        return loader, sampler

    # Single-process path: plain (optionally shuffled) loader, no sampler.
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=config.tconfig.batch_size,
        sampler=None,
        shuffle=shuffle,
        collate_fn=movie_collate_fn,
    )
    return loader, None
def train_model(rank, world_size, args):
    """Per-process training entry point (spawned once per GPU by mp.spawn).

    rank: this process's index, also used as its CUDA device index.
    world_size: total number of distributed processes.
    args: parsed CLI namespace (reads .n_gpu and .save).
    """
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    # Rank 0 is the "master" for logging/checkpointing duties; the
    # world_size == 0 guard covers the CPU-only path (avoids % by zero).
    master = (world_size == 0 or rank % world_size == 0)
    config.tconfig.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config.tconfig)
    best_epoch, best_loss, best_score = 0, 0, 0
    model = MovieClassification(config.tconfig)
    if os.path.isfile(args.save):
        print(f"rank:{rank} load state dict from : {args.save}")
        # BUG FIX: the original only printed this message and never restored
        # the weights. Load the checkpoint onto this rank's device.
        # NOTE(review): assumes args.save holds a plain state_dict — confirm
        # against the code that writes the checkpoint.
        model.load_state_dict(torch.load(args.save, map_location=config.tconfig.device))
    if 1 < args.n_gpu:
        model.to(config.tconfig.device)
        model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # --gpu: pin training to one GPU; left at None, every visible GPU is used.
    parser.add_argument("--gpu", default=None, type=int, required=False)
    args = parser.parse_args()
    config.tconfig.device = config.device
    if torch.cuda.is_available():
        args.n_gpu = torch.cuda.device_count() if args.gpu is None else 1
    else:
        args.n_gpu = 0
    print("available GPU : ", args.n_gpu)
    if 1 < args.n_gpu:
        # One worker per GPU; mp.spawn passes each worker its rank as the
        # first argument, followed by (world_size, args).
        mp.spawn(train_model, args=(args.n_gpu, args), nprocs=args.n_gpu, join=True)
    else:
        # BUG FIX: the original had no fallback branch, so on a single-GPU or
        # CPU-only machine the script printed the GPU count and exited without
        # ever training. Run training in-process as rank 0 instead.
        train_model(0, args.n_gpu, args)
以下是 OOM 错误信息:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/home/fightnyy/anaconda3/envs/selfresearch/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
fn(i, *args)
File "/home/fightnyy/workspace/kotransformer/main.py", line 145, in train_model
loss = train_epoch(config.tconfig, rank, epoch, model, criterion, optimizer, scheduler, train_loader)
File "/home/fightnyy/workspace/kotransformer/main.py", line 88, in train_epoch
outputs = model.forward(enc_inputs, dec_inputs)
File "/home/fightnyy/anaconda3/envs/selfresearch/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 619, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/home/fightnyy/anaconda3/envs/selfresearch/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/fightnyy/workspace/kotransformer/movieclassifier.py", line 24, in forward
dec_outputs, enc_self_attn_probs, self_attn_probs, dec_enc_attn_probs = self.transformer.forward(enc_inputs, dec_inputs)
File "/home/fightnyy/workspace/kotransformer/transformer.py", line 19, in forward
enc_outputs, enc_self_attn_probs = self.encoder.forward(enc_inputs)
File "/home/fightnyy/workspace/kotransformer/encoder.py", line 64, in forward
outputs, attn_prob = layer.forward(outputs, attn_mask)
File "/home/fightnyy/workspace/kotransformer/encoderlayer.py", line 24, in forward
pos_output = self.pos_ffnn(attn_output)
File "/home/fightnyy/anaconda3/envs/selfresearch/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/fightnyy/workspace/kotransformer/PFFN.py", line 23, in forward
output = self.activation(self.conv1(inputs.transpose(1,2)))
File "/home/fightnyy/anaconda3/envs/selfresearch/lib/python3.8/site-packages/torch/nn/functional.py", line 1383, in gelu
return torch._C._nn.gelu(input)
RuntimeError: CUDA out of memory. Tried to allocate 118.00 MiB (GPU 0; 10.76 GiB total capacity; 5.05 GiB already allocated; 77.44 MiB free; 5.31 GiB reserved in total by PyTorch)