训练图注意力网络时,Pytorch RuntimeError:CUDA内存不足

时间:2020-03-28 05:06:14

标签: out-of-memory pytorch gpu knowledge-graph

我正在尝试用PyTorch复现一篇论文中提出的模型的结果。该模型利用注意力机制来完成知识图谱中的关系预测。它主要分为两部分:图注意力网络(GAT)和卷积网络。训练GAT的代码如下:

def train_gat(args):
    """Train the graph-attention (GAT) embedding model, saving it after every epoch.

    Builds a ``SpKBGATModified`` model from the module-level
    ``entity_embeddings`` / ``relation_embeddings``, optimises it with Adam
    under a ``StepLR`` schedule (learning rate halved every 500 epochs) and a
    ``MarginRankingLoss``, and calls ``save_model`` at the end of each epoch.

    Besides ``args`` this function reads the module-level names ``Corpus_``,
    ``CUDA``, ``device_ids``, ``node_neighbors_2hop``, ``entity_embeddings``,
    ``relation_embeddings``, ``batch_gat_loss`` and ``save_model``.

    Args:
        args: Parsed options. Fields read here: ``use_2hop``, ``nheads_GAT``,
            ``entity_out_dim``, ``drop_GAT``, ``alpha``, ``lr``,
            ``weight_decay_gat``, ``margin``, ``epochs_gat``,
            ``batch_size_gat``, ``data`` and ``output_folder``.

    Returns:
        None. Per-epoch mean losses are collected locally and printed.
    """
    # Creating the gat model here.
    ####################################

    # One device object replaces the duplicated CUDA / CPU branches.
    device = torch.device("cuda" if CUDA else "cpu")

    current_batch_2hop_indices = torch.tensor([])
    if args.use_2hop:
        current_batch_2hop_indices = Corpus_.get_batch_nhop_neighbors_all(
            args, Corpus_.unique_entities_train, node_neighbors_2hop)

    # `Variable` is a deprecated no-op wrapper since PyTorch 0.4; plain
    # tensors are used directly.
    current_batch_2hop_indices = torch.LongTensor(
        current_batch_2hop_indices).to(device)

    print("Defining model")

    print(
        "\nModel type -> GAT layer with {} heads used , Initital Embeddings training".format(args.nheads_GAT[0]))
    model_gat = SpKBGATModified(entity_embeddings, relation_embeddings, args.entity_out_dim, args.entity_out_dim,
                                args.drop_GAT, args.alpha, args.nheads_GAT)

    if CUDA:
        model_gat.cuda()
        model_gat = torch.nn.DataParallel(model_gat, device_ids=device_ids)

    optimizer = torch.optim.Adam(
        model_gat.parameters(), lr=args.lr, weight_decay=args.weight_decay_gat)

    # scheduler.step() runs once per epoch below, so the learning rate is
    # multiplied by 0.5 every 500 epochs.
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=500, gamma=0.5, last_epoch=-1)

    gat_loss_func = nn.MarginRankingLoss(margin=args.margin)

    epoch_losses = []   # losses of all epochs
    print("Number of epochs {}".format(args.epochs_gat))

    for epoch in range(args.epochs_gat):
        print("\nepoch-> ", epoch)
        random.shuffle(Corpus_.train_triples)
        Corpus_.train_indices = np.array(
            list(Corpus_.train_triples)).astype(np.int32)

        model_gat.train()  # getting in training mode
        start_time = time.time()
        epoch_loss = []

        # Ceiling division: a trailing partial batch still gets an iteration.
        num_iters_per_epoch = -(-len(Corpus_.train_indices)
                                // args.batch_size_gat)

        for iters in range(num_iters_per_epoch):
            start_time_iter = time.time()
            # The second return value (labels) was converted and copied to the
            # device but never used in this loop, so it is no longer transferred.
            train_indices, _ = Corpus_.get_iteration_batch(iters)
            train_indices = torch.LongTensor(train_indices).to(device)

            # forward pass
            # NOTE(review): the forward call receives Corpus_.train_adj_matrix
            # on every iteration, so its memory use presumably does not shrink
            # with batch_size_gat - confirm against the model implementation.
            entity_embed, relation_embed = model_gat(
                Corpus_, Corpus_.train_adj_matrix, train_indices, current_batch_2hop_indices)

            optimizer.zero_grad()

            loss = batch_gat_loss(
                gat_loss_func, train_indices, entity_embed, relation_embed)

            loss.backward()
            optimizer.step()

            # Read the scalar once; it is both stored and printed.
            loss_value = loss.item()
            epoch_loss.append(loss_value)

            end_time_iter = time.time()

            print("Iteration-> {0}  , Iteration_time-> {1:.4f} , Iteration_loss {2:.4f}".format(
                iters, end_time_iter - start_time_iter, loss_value))

        scheduler.step()
        mean_epoch_loss = sum(epoch_loss) / len(epoch_loss)
        print("Epoch {} , average loss {} , epoch_time {}".format(
            epoch, mean_epoch_loss, time.time() - start_time))
        epoch_losses.append(mean_epoch_loss)

        save_model(model_gat, args.data, epoch,
                   args.output_folder)

用于计算GAT损失的代码如下:

def _translation_l1_norm(triples, entity_embed, relation_embed):
    """Return the per-row L1 norm of ``head + relation - tail`` for ``triples``.

    Column 0 and column 2 of ``triples`` index ``entity_embed``; column 1
    indexes ``relation_embed``.
    """
    residual = (entity_embed[triples[:, 0]]
                + relation_embed[triples[:, 1]]
                - entity_embed[triples[:, 2]])
    return torch.norm(residual, p=1, dim=1)


def batch_gat_loss(gat_loss_func, train_indices, entity_embed, relation_embed,
                   valid_invalid_ratio=None):
    """Compute the ranking loss between positive and negative triples of a batch.

    The first ``len(train_indices) // (ratio + 1)`` rows of ``train_indices``
    are treated as positive triples and the remaining rows as negatives. Each
    positive score is paired with ``ratio`` negative scores.

    Args:
        gat_loss_func: Callable invoked as ``gat_loss_func(pos, neg, target)``,
            e.g. ``nn.MarginRankingLoss``.
        train_indices: 2-D integer tensor whose columns 0, 1, 2 are used as
            head-entity, relation and tail-entity indices.
        entity_embed: Tensor indexed by entity index along dim 0.
        relation_embed: Tensor indexed by relation index along dim 0.
        valid_invalid_ratio: Number of negatives per positive. When ``None``
            (the default) the module-level ``args.valid_invalid_ratio_gat``
            is used, as before.

    Returns:
        The value returned by ``gat_loss_func``.
    """
    if valid_invalid_ratio is None:
        valid_invalid_ratio = args.valid_invalid_ratio_gat
    ratio = int(valid_invalid_ratio)

    len_pos_triples = train_indices.shape[0] // (ratio + 1)

    pos_triples = train_indices[:len_pos_triples]
    neg_triples = train_indices[len_pos_triples:]

    # Score each positive triple once and tile the 1-D result, instead of
    # tiling the triples first: this gathers `len_pos_triples` embedding rows
    # rather than `ratio * len_pos_triples`, giving the same values in the
    # same order with smaller intermediate tensors.
    pos_norm = _translation_l1_norm(
        pos_triples, entity_embed, relation_embed).repeat(ratio)
    neg_norm = _translation_l1_norm(neg_triples, entity_embed, relation_embed)

    # Target of -1 for every pair. It is created on the same device (and with
    # the same dtype) as the scores, so the function also works without CUDA.
    y = -torch.ones_like(pos_norm)

    loss = gat_loss_func(pos_norm, neg_norm, y)
    return loss

该论文中用于训练模型的数据集为FB15k-237、NELL-995和umls,它们训练集中的三元组数目分别为272115、149678和5216。我成功地在这三个数据集上复现了结果。但是,当我尝试使用新的数据集YAGO3-10(包含1079040个训练三元组)来训练模型时,出现了GPU内存溢出错误。错误信息如下:

Traceback (most recent call last):
  File "main.py", line 366, in <module>
    train_gat(args)
  File "main.py", line 240, in train_gat
    gat_loss_func, train_indices, entity_embed, relation_embed)
  File "main.py", line 149, in batch_gat_loss
    x = source_embeds + relation_embeds - tail_embeds
RuntimeError: CUDA out of memory. Tried to allocate 1.61 GiB (GPU 0; 15.77 GiB total capacity; 14.24 GiB already allocated; 120.25 MiB free; 440.29 MiB cached)

我尝试过的方法如下:

  1. 减小批次大小。减小之后,内存溢出改为发生在反向传播期间。错误消息如下:

    Traceback (most recent call last): File "main.py", line 366, in <module> train_gat(args) File "main.py", line 242, in train_gat loss.backward() File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/tensor.py", line 107, in backward torch.autograd.backward(self, gradient, retain_graph, create_graph) File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/autograd/__init__.py", line 93, in backward allow_unreachable=True) # allow_unreachable flag RuntimeError: CUDA out of memory. Tried to allocate 2.60 GiB (GPU 0; 15.77 GiB total capacity; 11.86 GiB already allocated; 2.42 GiB free; 511.96 MiB cached)

  2. 在反向传播之前,使用torch.cuda.empty_cache()删除无用的变量。但是,最大内存使用量似乎并没有改变。

  3. 使用数据并行在两个GPU上训练模型,但未成功运行,这似乎是因为某些模型参数在不同的GPU上。错误消息如下:

    Traceback (most recent call last): File "main.py", line 377, in <module> train_gat(args) File "main.py", line 244, in train_gat Corpus_, Corpus_.train_adj_matrix_dim1, Corpus_.train_adj_matrix_dim00, Corpus_.train_adj_matrix_dim01 ,train_indices, current_batch_2hop_indices) File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__ result = self.forward(*input, **kwargs) File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 152, in forward outputs = self.parallel_apply(replicas, inputs, kwargs) File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 162, in parallel_apply return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in parallel_apply raise output File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 59, in _worker output = module(*input, **kwargs) File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__ result = self.forward(*input, **kwargs) File "/home/user1/cxy/KBAT-2/models.py", line 148, in forward edge_list, edge_type, edge_embed, edge_list_nhop, edge_type_nhop) File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__ result = self.forward(*input, **kwargs) File "/home/user1/cxy/KBAT-2/models.py", line 55, in forward for att in self.attentions], dim=1) File "/home/user1/cxy/KBAT-2/models.py", line 55, in <listcomp> for att in self.attentions], dim=1) File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__ result = self.forward(*input, **kwargs) File "/home/user1/cxy/KBAT-2/layers.py", line 140, in forward edge_m = 
self.a.mm(edge_h) RuntimeError: arguments are located on different GPUs at /opt/conda/conda-bld/pytorch_1556653183467/work/aten/src/THC/generic/THCTensorMathBlas.cu:255

  4. 使用apex包进行float16计算。但是,某些参数会出现nan值,这可能是使用float16导致的。

后两种方法可能有效,但是我没有成功运行代码。我想问问是否有人有使用上述方法的经验,或者有更好的建议。

由于篇幅所限,我无法显示所有代码。该模型的代码链接如下: https://github.com/deepakn97/relationPrediction

0 个答案:

没有答案