How does PyTorch's nn.Module store the computation graph, and when and how is it freed?

Date: 2018-10-04 03:49:46

Tags: pytorch

I took the following code from https://github.com/liorshk/facenet_pytorch/blob/master/train_triplet.py:

def train(train_loader, model, optimizer, epoch):
    # switch to train mode
    model.train()

    pbar = tqdm(enumerate(train_loader))
    labels, distances = [], []

    for batch_idx, (data_a, data_p, data_n, label_p, label_n) in pbar:

        data_a, data_p, data_n = data_a.cuda(), data_p.cuda(), data_n.cuda()
        data_a, data_p, data_n = Variable(data_a), Variable(data_p), \
                                 Variable(data_n)

        # compute output
        out_a, out_p, out_n = model(data_a), model(data_p), model(data_n)

        # Choose the hard negatives
        d_p = l2_dist.forward(out_a, out_p)
        d_n = l2_dist.forward(out_a, out_n)
        all = (d_n - d_p < args.margin).cpu().data.numpy().flatten()
        hard_triplets = np.where(all == 1)
        if len(hard_triplets[0]) == 0:
            continue
        out_selected_a = Variable(torch.from_numpy(out_a.cpu().data.numpy()[hard_triplets]).cuda())
        out_selected_p = Variable(torch.from_numpy(out_p.cpu().data.numpy()[hard_triplets]).cuda())
        out_selected_n = Variable(torch.from_numpy(out_n.cpu().data.numpy()[hard_triplets]).cuda())

        selected_data_a = Variable(torch.from_numpy(data_a.cpu().data.numpy()[hard_triplets]).cuda())
        selected_data_p = Variable(torch.from_numpy(data_p.cpu().data.numpy()[hard_triplets]).cuda())
        selected_data_n = Variable(torch.from_numpy(data_n.cpu().data.numpy()[hard_triplets]).cuda())

        selected_label_p = torch.from_numpy(label_p.cpu().numpy()[hard_triplets])
        selected_label_n = torch.from_numpy(label_n.cpu().numpy()[hard_triplets])
        triplet_loss = TripletMarginLoss(args.margin).forward(out_selected_a, out_selected_p, out_selected_n)

        cls_a = model.forward_classifier(selected_data_a)
        cls_p = model.forward_classifier(selected_data_p)
        cls_n = model.forward_classifier(selected_data_n)

        criterion = nn.CrossEntropyLoss()
        predicted_labels = torch.cat([cls_a, cls_p, cls_n])
        true_labels = torch.cat([Variable(selected_label_p.cuda()), Variable(selected_label_p.cuda()), Variable(selected_label_n.cuda())])

        cross_entropy_loss = criterion(predicted_labels.cuda(), true_labels.cuda())

        loss = cross_entropy_loss + triplet_loss
        # compute gradient and update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # update the optimizer learning rate
        adjust_learning_rate(optimizer)

        # log loss value
        logger.log_value('triplet_loss', triplet_loss.data[0]).step()
        logger.log_value('cross_entropy_loss', cross_entropy_loss.data[0]).step()
        logger.log_value('total_loss', loss.data[0]).step()
        if batch_idx % args.log_interval == 0:
            pbar.set_description(
                'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f} \t # of Selected Triplets: {}'.format(
                    epoch, batch_idx * len(data_a), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader),
                    loss.data[0], len(hard_triplets[0])))

        dists = l2_dist.forward(out_selected_a, out_selected_n)  # euclidean distance
        distances.append(dists.data.cpu().numpy())
        labels.append(np.zeros(dists.size(0)))

        dists = l2_dist.forward(out_selected_a, out_selected_p)  # euclidean distance
        distances.append(dists.data.cpu().numpy())
        labels.append(np.ones(dists.size(0)))

    labels = np.array([sublabel for label in labels for sublabel in label])
    distances = np.array([subdist[0] for dist in distances for subdist in dist])

    tpr, fpr, accuracy, val, val_std, far = evaluate(distances, labels)
    print('\33[91mTrain set: Accuracy: {:.8f}\n\33[0m'.format(np.mean(accuracy)))
    logger.log_value('Train Accuracy', np.mean(accuracy))

    plot_roc(fpr, tpr, figure_name="roc_train_epoch_{}.png".format(epoch))

    # do checkpointing
    torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict()},
               '{}/checkpoint_{}.pth'.format(LOG_DIR, epoch))

As you can see, there are a number of model.forward(..) calls:

out_a, out_p, out_n = model(data_a), model(data_p), model(data_n)
....
cls_a = model.forward_classifier(selected_data_a)
cls_p = model.forward_classifier(selected_data_p)
cls_n = model.forward_classifier(selected_data_n)
(model.forward_classifier() also invokes model.forward().)
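
For reference, here is how I currently understand graph retention, as a minimal sketch (written against the modern tensor API instead of the deprecated Variable wrapper; the names x, w, out are made up): every forward pass records a graph that stays reachable through the output's grad_fn, and the graph's intermediate buffers are freed either by backward() (with the default retain_graph=False) or once the last Python reference to the output is dropped.

import torch

x = torch.randn(8, 16, requires_grad=True)
w = torch.randn(16, 4, requires_grad=True)

# Forward pass: autograd records a graph, reachable through out.grad_fn
out = (x @ w).relu().sum()
print(out.grad_fn)  # e.g. <SumBackward0 object at 0x...>

# backward() walks the graph and, with the default retain_graph=False,
# frees its intermediate buffers as it goes
out.backward()
# out.backward()  # a second call would raise: buffers have already been freed

# Without backward(), the graph lives for as long as something references out:
out2 = (x @ w).relu().sum()
del out2  # last reference dropped -> graph and buffers become collectable

If that picture is right, each of the model(...) calls above keeps a separate graph alive for as long as its output (out_a, out_p, out_n, cls_a, ...) is still referenced.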

Because loss.backward() is called, a little GPU memory is freed before the second iteration begins, but more than 30 GB of GPU memory (I have 4x 1080 Ti) is never released, and training then crashes with a GPU out-of-memory error. So my questions are:

1. How (or where) does nn.Module store the computation graph, and how can that GPU memory be freed?

2. Is it mandatory to call backward()? If so, what should the .backward() call look like?

3. In my case, which part of the GPU memory is freed and which part is kept? Is there a way to see this? (A sketch of what I have in mind is below.)
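
For question 3, a minimal sketch of the kind of instrumentation I mean, assuming a CUDA build of PyTorch >= 0.4 (the log_gpu_memory helper name is my own):

import torch

def log_gpu_memory(tag, device=0):
    # Bytes currently held by live tensors, and the peak since the process started
    allocated = torch.cuda.memory_allocated(device) / 1024 ** 2
    peak = torch.cuda.max_memory_allocated(device) / 1024 ** 2
    print('{}: allocated {:.1f} MiB, peak {:.1f} MiB'.format(tag, allocated, peak))

# Hypothetical placement inside the training loop above:
#   log_gpu_memory('after forward')   # graphs/activations of all forward passes alive
#   loss.backward()
#   log_gpu_memory('after backward')  # intermediate buffers freed; weights and grads remain

As far as I know, nvidia-smi will not show this memory coming back: blocks freed by autograd stay in PyTorch's caching allocator for reuse, and only torch.cuda.empty_cache() returns them to the driver.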

0 Answers

There are no answers yet.