I am trying to train a DenseNet model on a set of patches. My dataset has 20,000 images for training and 60,000 images for evaluation.
Here is my main function:
if __name__ == "__main__":
    for epoch in range(START_EPOCH, START_EPOCH + hp.epoch):
        adjust_learning_rate(optimizer, epoch)
        train(epoch, hp.wrong_save)
        # mining(epoch)
        valid(epoch)
Because of learning rate decay, I adjust the learning rate at every epoch, then run training (which loads batches from the trainloader onto the GPU), and then run the validation function on the validation set to check some metrics for that epoch.
I have an Nvidia GTX 1060 (6 GB) and the training loop runs without any problem, but when I iterate over the valloader I get:
RuntimeError: CUDA out of memory. Tried to allocate 74.12 MiB (GPU 0; 5.93 GiB total capacity; 4.73 GiB already allocated; 75.06 MiB free; 19.57 MiB cached)
I would like to know what you think the best approach is here. Is there a way to run the evaluation without loading the whole evaluation set onto the GPU? Is there any workaround?
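One workaround I am considering is wrapping the validation loop in torch.no_grad(), since on recent PyTorch versions (>= 0.4) volatile=True is deprecated and ignored, so the autograd graph may still be kept alive during my validation pass. A minimal sketch of what I mean (the evaluate name and the device argument are just for illustration, not my actual code):

import torch

def evaluate(net, valloader, criterion, device):
    '''Run validation without building the autograd graph.'''
    net.eval()
    valid_loss, batches = 0.0, 0
    with torch.no_grad():                   # no graph is stored, activations are freed per batch
        for inputs, targets in valloader:
            inputs = inputs.to(device)      # move one batch at a time, not the whole set
            targets = targets.float().to(device)
            outputs = net(inputs).squeeze()
            valid_loss += criterion(outputs, targets).item()
            batches += 1
    return valid_loss / max(batches, 1)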
Here are my train and valid functions:
# Optimization, Loss Function Init
criterion = nn.BCELoss()
optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=hp.momentum, weight_decay=hp.weight_decay)
def train(epoch, wrong_save=False):
    '''Train net using patches of slide.
    Save a csv file listing the patch file names predicted incorrectly.
    Args:
        epoch (int): current epoch
        wrong_save (bool): If True, save the csv file that lists the patch
            file names predicted incorrectly
    '''
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    wrong_list = []
    for batch_idx, (inputs, targets, filename) in enumerate(trainloader):
        if USE_CUDA:
            inputs = inputs.cuda()
            targets = torch.FloatTensor(np.array(targets).astype(float)).cuda()
        optimizer.zero_grad()
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)
        outputs = torch.squeeze(outputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        total += targets.size(0)
        batch_size = targets.shape[0]
        # binarize: floor(p + THRESHOLD) is 1 iff p >= 1 - THRESHOLD
        outputs += Variable((torch.ones(batch_size) * THRESHOLD).cuda())
        outputs = torch.floor(outputs)
        correct += outputs.data.eq(targets.data).cpu().sum()
        if wrong_save:
            for idx in range(len(filename)):
                if outputs.data[idx] != targets.data[idx]:
                    wrong_list.append(filename[idx])
        progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                     % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))
    if wrong_save:
        wrong_csv = open(cf.wrong_path+'wrong_data_epoch'+str(epoch)+'.csv', 'w', encoding='utf-8')
        wr = csv.writer(wrong_csv)
        for name in wrong_list:
            wr.writerow([name])
        wrong_csv.close()
    CUR_TRA_ACC.append(100.*correct/total)
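To make the floor trick above concrete (THRESHOLD = 0.6 is just an illustrative value; note that train and valid binarize at different cut-offs, 1 - THRESHOLD vs. THRESHOLD):

import torch

THRESHOLD = 0.6                           # illustrative value only
p = torch.tensor([0.30, 0.45, 0.70])      # example sigmoid outputs
print(torch.floor(p + THRESHOLD))         # tensor([0., 1., 1.]) -> 1 iff p >= 1 - THRESHOLD (train)
print(torch.floor(p + (1 - THRESHOLD)))   # tensor([0., 0., 1.]) -> 1 iff p >= THRESHOLD (valid)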
def valid(epoch):
    '''Validate net using patches of slide.
    Save checkpoint if AUC score is higher than the saved checkpoint's.
    Args:
        epoch (int): current epoch
    '''
    global BEST_AUC
    global THRESHOLD
    global LR_CHANCE
    global CK_CHANCE
    global LR_DECAY
    net.eval()
    valid_loss = 0
    total = 0
    correct = 0
    outputs_list = np.array([])
    targets_list = np.array([])
    for batch_idx, (inputs, targets) in enumerate(valloader):
        if USE_CUDA:
            inputs = inputs.cuda()
            targets = torch.FloatTensor(np.array(targets).astype(float)).cuda()
        batch_size = targets.shape[0]
        inputs, targets = Variable(inputs, volatile=True), Variable(targets)
        outputs = net(inputs)
        total += targets.size(0)
        outputs = torch.squeeze(outputs)
        loss = criterion(outputs, targets)
        valid_loss += loss.item()
        _outputs = np.array(outputs.data.cpu()).astype(float)
        _targets = np.array(targets.data.cpu()).astype(float)
        outputs_list = np.append(outputs_list, _outputs)
        targets_list = np.append(targets_list, _targets)
        # binarize: floor(p + (1 - THRESHOLD)) is 1 iff p >= THRESHOLD
        outputs += Variable((torch.ones(batch_size) * (1-THRESHOLD)).cuda())
        outputs = torch.floor(outputs)
        correct += int(outputs.eq(targets).cpu().sum())
        progress_bar(batch_idx, len(valloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                     % (valid_loss/(batch_idx+1), 100.*correct/total, correct, total))
    correct, tp, tn, fp, fn, recall, precision, specificity, f1_score, auc, threshold = stats(outputs_list, targets_list)
    acc = correct/total
    THRESHOLD = threshold
    print(tp, tn, fp, fn)
    print('Acc: %.3f, Recall: %.3f, Prec: %.3f, Spec: %.3f, F1: %.3f, Thres: %.3f, AUC: %.3f'
          % (acc, recall, precision, specificity, f1_score, threshold, auc))
    print('%17s %12s\n%-11s %-8d %-8d\n%-11s %-8d %-8d'
          % ('Tumor', 'Normal', 'pos', tp, fp, 'neg', fn, tn))
    print("lr: ", args.lr * (0.5 ** LR_DECAY), "lr chance:", LR_CHANCE)
    # plot data
    CUR_EPOCH.append(epoch)
    CUR_VAL_ACC.append(acc)
    CUR_LOSS.append(valid_loss/(batch_idx+1))
    CUR_LR.append(args.lr * (0.5 ** LR_DECAY))
    # Save checkpoint.
    if auc > BEST_AUC:
        print('saving...')
        BEST_AUC = auc
        state = {
            'net': net,
            'acc': acc,
            'loss': valid_loss,
            'recall': recall,
            'specificity': specificity,
            'precision': precision,
            'f1_score': f1_score,
            'auc': auc,
            'epoch': epoch,
            'lr': args.lr * (0.5 ** LR_DECAY),
            'threshold': threshold
        }
        torch.save(state, './checkpoint/ckpt.t7')
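In case it matters, adjust_learning_rate applies the same step decay that the printouts above show (lr = args.lr * 0.5 ** LR_DECAY); roughly this, modulo the bookkeeping of LR_DECAY:

def adjust_learning_rate(optimizer, epoch):
    # step decay sketch: the base lr is halved once per LR_DECAY increment
    lr = args.lr * (0.5 ** LR_DECAY)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr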
And this is how I construct the datasets:
def get_dataset(train_transform, test_transform, train_max,
                val_max, subtest_max, ratio=0, mining_mode=False):
    '''Dataset function to get train, valid, subtest, test, and mining datasets.
    Args:
        train_transform (torchvision.transforms): train set transform for data augmentation
        test_transform (torchvision.transforms): test set transform for data augmentation
        train_max (int): limit of train set
        val_max (int): limit of validation set
        subtest_max (int): limit of subtest set
        ratio (int): for mining_mode, inclusion ratio of train set compared to mining set
        mining_mode (bool): If True, also return the mining dataset
    '''
    train_dataset = camel(cf.dataset_path + 'train/', usage='train',
                          limit=train_max, transform=train_transform)
    val_dataset = camel(cf.dataset_path + 'validation/', usage='val',
                        limit=val_max, transform=test_transform)
    subtest_dataset = camel(cf.dataset_path + 'test/', usage='subtest',
                            limit=subtest_max, transform=test_transform)
    test_dataset = camel(cf.test_path, usage='test', transform=test_transform)
    if mining_mode:
        mining_dataset = camel(cf.dataset_path + 'mining/', usage='mining',
                               train_ratio=ratio, transform=train_transform)
        return train_dataset, val_dataset, subtest_dataset, test_dataset, mining_dataset
    else:
        return train_dataset, val_dataset, subtest_dataset, test_dataset
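For reference, I wrap these datasets in DataLoaders along these lines (batch sizes and num_workers here are placeholders, not my exact settings; only one batch at a time ever lives on the GPU):

from torch.utils.data import DataLoader

trainloader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                         num_workers=4, pin_memory=True)
# a smaller batch_size here is another easy way to cut peak GPU memory during validation
valloader = DataLoader(val_dataset, batch_size=32, shuffle=False,
                       num_workers=4, pin_memory=True)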