RuntimeError: CUDA out of memory - code or GPU problem?

Asked: 2020-09-13 13:59:07

Tags: deep-learning pytorch gpu

I am currently working on a computer vision project and keep getting a runtime error saying "CUDA out of memory". I have tried everything I can think of: reducing the batch size and image resolution, clearing the cache, deleting variables after training starts, reducing the amount of image data, and so on. Unfortunately, the error will not go away. I have an Nvidia GeForce 940MX graphics card in my HP Pavilion laptop, and I installed CUDA 10.2 and cuDNN following the PyTorch installation page. My goal is to build a Flask website around this model, but I am stuck on this issue. Any suggestions would be helpful.
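For reference, the cache-clearing and variable-deletion steps mentioned above are usually done with Python's garbage collector plus PyTorch's allocator call; a minimal sketch (note that empty_cache only releases cached blocks, not memory still held by live tensors):

import gc
import torch

gc.collect()              # free unreferenced Python objects (and the tensors they own)
torch.cuda.empty_cache()  # release unused cached blocks back to the CUDA driver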

Here is my code:

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import os
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import albumentations as A
from torch.utils.data import TensorDataset, DataLoader,Dataset
from torchvision import models
from collections import defaultdict
from torch.utils.data.sampler import RandomSampler
import torch.optim as optim
from torch.optim import lr_scheduler
from sklearn import model_selection
from tqdm import tqdm
import gc




# generate data from csv file
class Build_dataset(Dataset):

    def __init__(self, csv, split, mode, transform=None):
        self.csv = csv.reset_index(drop=True)
        self.split = split
        self.mode = mode
        self.transform = transform

    def __len__(self):
        return self.csv.shape[0]

    def __getitem__(self, index):
        row = self.csv.iloc[index]

        image = cv2.imread(row.filepath)
        # OpenCV loads images as BGR; convert to RGB before feeding the model
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transform is not None:
            res = self.transform(image=image)
            image = res['image'].astype(np.float32)
        else:
            image = image.astype(np.float32)

        image = image.transpose(2, 0, 1)
        data = torch.tensor(image).float()

        if self.mode == 'test':
            return data
        else:
            return data, torch.tensor(self.csv.iloc[index].target).long()


# training epoch          
def train_epoch(model, loader, optimizer, loss_fn, device, scheduler, n_examples):

    model = model.train()

    losses = []
    correct_predictions = 0

    for inputs, labels in tqdm(loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # here you delete inputs and labels and then use gc.collect to free memory
        del inputs, labels
        gc.collect()

    return correct_predictions.double() / n_examples, np.mean(losses)


# validation epoch 
def val_epoch(model, loader, loss_fn, device, n_examples):

    model = model.eval()

    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for inputs, labels in tqdm(loader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())
            # here you delete inputs and labels and then use gc.collect to free memory
            del inputs, labels
            gc.collect()

    return correct_predictions.double() / n_examples, np.mean(losses)

def train(model, device, num_epochs):
    # generate data
    dataset_train = Build_dataset(df_train, 'train', 'train', transform=transforms_train)
    dataset_valid = Build_dataset(df_valid, 'train', 'val', transform=transforms_val)

    # load data
    train_loader = DataLoader(dataset_train, batch_size=16, sampler=RandomSampler(dataset_train), num_workers=4)
    valid_loader = DataLoader(dataset_valid, batch_size=16, shuffle=True, num_workers=4)

    dataset_train_size = len(dataset_train)
    dataset_valid_size = len(dataset_valid)

    optimizer = optim.Adam(model.parameters(), lr=3e-5)

    model = model.to(device)

    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, threshold=0.001, mode='max')

    loss_fn = nn.CrossEntropyLoss().to(device)

    history = defaultdict(list)
    best_accuracy = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1} / {num_epochs}')
        print('-' * 30)

        train_acc, train_loss = train_epoch(model, train_loader, optimizer, loss_fn, device, scheduler, dataset_train_size)
        print(f'Train loss {train_loss} accuracy {train_acc}')
        valid_acc, valid_loss = val_epoch(model, valid_loader, loss_fn, device, dataset_valid_size)
        print(f'Val   loss {valid_loss} accuracy {valid_acc}')
        print()

        # ReduceLROnPlateau must be stepped with the monitored metric; mode='max' tracks accuracy
        scheduler.step(valid_acc)

        history['train_acc'].append(train_acc)
        history['train_loss'].append(train_loss)
        history['val_acc'].append(valid_acc)
        history['val_loss'].append(valid_loss)

        if valid_acc > best_accuracy:
            torch.save(model.state_dict(), 'best_model_state.bin')
            best_accuracy = valid_acc

    print(f'Best Accuracy: {best_accuracy}')

    model.load_state_dict(torch.load('best_model_state.bin'))

    return model, history


if __name__ == '__main__':
    # competition data - 2020
    data_dir = "C:\\Users\\Aniruddh\\Documents\\kaggle\\jpeg_melanoma_2020"
    # competition data - 2019
    data_dir2 = "C:\\Users\\Aniruddh\\Documents\\kaggle\\jpeg_melanoma_2019"
    # device
    device = torch.device("cuda")

    # augmenting images
    image_size = 384
    transforms_train = A.Compose([
        A.Transpose(p=0.5),
        A.VerticalFlip(p=0.5),
        A.HorizontalFlip(p=0.5),
        A.RandomBrightness(limit=0.2, p=0.75),
        A.RandomContrast(limit=0.2, p=0.75),
        A.OneOf([
            A.MedianBlur(blur_limit=5),
            A.GaussianBlur(blur_limit=5),
            A.GaussNoise(var_limit=(5.0, 30.0)),
        ], p=0.7),

        A.OneOf([
            A.OpticalDistortion(distort_limit=1.0),
            A.GridDistortion(num_steps=5, distort_limit=1.),
            A.ElasticTransform(alpha=3),
        ], p=0.7),

        A.CLAHE(clip_limit=4.0, p=0.7),
        A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=20, val_shift_limit=10, p=0.5),
        A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=15, border_mode=0, p=0.85),
        A.Resize(image_size, image_size),
        A.Cutout(max_h_size=int(image_size * 0.375), max_w_size=int(image_size * 0.375), num_holes=1, p=0.7),
        A.Normalize()
    ])

    transforms_val = A.Compose([
        A.Resize(image_size, image_size),
        A.Normalize()
    ])

    # create data
    df_train = pd.read_csv("C:\\Users\\Aniruddh\\Documents\\kaggle\\jpeg_melanoma_2020\\train.csv")  # /kaggle/input/siim-isic-melanoma-classification/train.csv
    df_train.head()

    df_train['is_ext'] = 0
    df_train['filepath'] = df_train['image_name'].apply(lambda x: os.path.join(data_dir, 'train', f'{x}.jpg'))

    # dataset from 2020 data
    df_train['diagnosis'] = df_train['diagnosis'].apply(lambda x: x.replace('seborrheic keratosis', 'BKL'))
    df_train['diagnosis'] = df_train['diagnosis'].apply(lambda x: x.replace('lichenoid keratosis', 'BKL'))
    df_train['diagnosis'] = df_train['diagnosis'].apply(lambda x: x.replace('solar lentigo', 'BKL'))
    df_train['diagnosis'] = df_train['diagnosis'].apply(lambda x: x.replace('lentigo NOS', 'BKL'))
    df_train['diagnosis'] = df_train['diagnosis'].apply(lambda x: x.replace('cafe-au-lait macule', 'unknown'))
    df_train['diagnosis'] = df_train['diagnosis'].apply(lambda x: x.replace('atypical melanocytic proliferation', 'unknown'))

    # dataset from 2019 data
    df_train2 = pd.read_csv('/content/drive/My Drive/siim_melanoma images/train_2019.csv')
    df_train2 = df_train2[df_train2['tfrecord'] >= 0].reset_index(drop=True)
    # df_train2['fold'] = df_train2['tfrecord'] % 5
    df_train2['is_ext'] = 1
    df_train2['filepath'] = df_train2['image_name'].apply(lambda x: os.path.join(data_dir2, 'train', f'{x}.jpg'))

    df_train2['diagnosis'] = df_train2['diagnosis'].apply(lambda x: x.replace('NV', 'nevus'))
    df_train2['diagnosis'] = df_train2['diagnosis'].apply(lambda x: x.replace('MEL', 'melanoma'))

    # concat both 2019 and 2020 data
    df_train = pd.concat([df_train, df_train2]).reset_index(drop=True)

    # shuffle data
    df = df_train.sample(frac=1).reset_index(drop=True)

    # creating 8 different target values
    new_target = {d: idx for idx, d in enumerate(sorted(df.diagnosis.unique()))}
    df['target'] = df['diagnosis'].map(new_target)
    mel_idx = new_target['melanoma']

    df = df[['filepath', 'diagnosis', 'target', 'is_ext']]

    class_names = list(df['diagnosis'].unique())

    # splitting train and validation data (roughly 20% validation)
    df_valid = df[:11471]
    df_train = df[11471:].reset_index(drop=True)  # note: was df[11472:], which silently skipped row 11471

    # create model
    def create_model(n_classes):
        model = models.resnet50(pretrained=True)

        n_features = model.fc.in_features
        model.fc = nn.Linear(n_features, n_classes)
        return model.to(device)

    # model
    base_model = create_model(len(class_names))

    # train model
    base_model, history = train(base_model, device, num_epochs=15)
Goal of the code

The aim of the project is to classify skin-cancer images by building 8 different target classes from the given dataset (the competition itself is about classifying benign vs. malignant images, but because the data is heavily skewed I use the diagnosis column of the dataset as the target variable instead). The model is ResNet-50 from torchvision's models.

This is the data used:
Skin images (this year's competition): https://www.kaggle.com/cdeotte/jpeg-melanoma-384x384
Skin images (last year's competition): https://www.kaggle.com/cdeotte/jpeg-isic2019-384x384

I decided to build a Flask app on top of this model, but the CUDA memory keeps causing a runtime error:

RuntimeError: CUDA out of memory. Tried to allocate 144.00 MiB (GPU 0; 2.00 GiB total capacity; 1.21 GiB already allocated; 43.55 MiB free; 1.23 GiB reserved in total by PyTorch)
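To see where that memory goes, PyTorch's allocator counters can be checked; a small diagnostic sketch, assuming PyTorch 1.4 or newer:

import torch

mib = 1024 ** 2
print(f"allocated: {torch.cuda.memory_allocated(0) / mib:.0f} MiB")      # live tensors
print(f"peak:      {torch.cuda.max_memory_allocated(0) / mib:.0f} MiB")  # high-water mark
print(f"reserved:  {torch.cuda.memory_reserved(0) / mib:.0f} MiB")       # allocator pool
print(torch.cuda.memory_summary(0))                                      # detailed breakdown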

These are the details of my Nvidia GPU:

Sun Sep 13 19:09:34 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 451.67       Driver Version: 451.67       CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce 940MX      WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   63C    P8    N/A /  N/A |     37MiB /  2048MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

# more information about my GPU
==============NVSMI LOG==============

Timestamp                                 : Sun Sep 13 19:11:22 2020
Driver Version                            : 451.67
CUDA Version                              : 11.0

Attached GPUs                             : 1
GPU 00000000:01:00.0
Product Name                          : GeForce 940MX
Product Brand                         : GeForce
Display Mode                          : Disabled
Display Active                        : Disabled
Persistence Mode                      : N/A
Accounting Mode                       : Disabled
Accounting Mode Buffer Size           : 4000
Driver Model
    Current                           : WDDM
    Pending                           : WDDM
Serial Number                         : N/A
GPU UUID                              : GPU-9a8c69df-26f2-2a98-3712-ea22f6add038
Minor Number                          : N/A
VBIOS Version                         : 82.08.6D.00.8C
MultiGPU Board                        : No
Board ID                              : 0x100
GPU Part Number                       : N/A
Inforom Version
    Image Version                     : N/A
    OEM Object                        : N/A
    ECC Object                        : N/A
    Power Management Object           : N/A
GPU Operation Mode
    Current                           : N/A
    Pending                           : N/A
GPU Virtualization Mode
    Virtualization Mode               : None
    Host VGPU Mode                    : N/A
IBMNPU
    Relaxed Ordering Mode             : N/A
PCI
    Bus                               : 0x01
    Device                            : 0x00
    Domain                            : 0x0000
    Device Id                         : 0x134D10DE
    Bus Id                            : 00000000:01:00.0
    Sub System Id                     : 0x83F9103C
    GPU Link Info
        PCIe Generation
            Max                       : 3
            Current                   : 1
        Link Width
            Max                       : 4x
            Current                   : 4x
    Bridge Chip
        Type                          : N/A
        Firmware                      : N/A
    Replays Since Reset               : 0
    Replay Number Rollovers           : 0
    Tx Throughput                     : 0 KB/s
    Rx Throughput                     : 0 KB/s
Fan Speed                             : N/A
Performance State                     : P8
Clocks Throttle Reasons
    Idle                              : Not Active
    Applications Clocks Setting       : Not Active
    SW Power Cap                      : Not Active
    HW Slowdown                       : Not Active
        HW Thermal Slowdown           : N/A
        HW Power Brake Slowdown       : N/A
    Sync Boost                        : Not Active
    SW Thermal Slowdown               : Not Active
    Display Clock Setting             : Not Active
FB Memory Usage
    Total                             : 2048 MiB
    Used                              : 37 MiB
    Free                              : 2011 MiB
BAR1 Memory Usage
    Total                             : 256 MiB
    Used                              : 225 MiB
    Free                              : 31 MiB
Compute Mode                          : Default
Utilization
    Gpu                               : 0 %
    Memory                            : 0 %
    Encoder                           : N/A
    Decoder                           : N/A
Encoder Stats
    Active Sessions                   : 0
    Average FPS                       : 0
    Average Latency                   : 0
FBC Stats
    Active Sessions                   : 0
    Average FPS                       : 0
    Average Latency                   : 0
Ecc Mode
    Current                           : N/A
    Pending                           : N/A
ECC Errors
    Volatile
        Single Bit
            Device Memory             : N/A
            Register File             : N/A
            L1 Cache                  : N/A
            L2 Cache                  : N/A
            Texture Memory            : N/A
            Texture Shared            : N/A
            CBU                       : N/A
            Total                     : N/A
        Double Bit
            Device Memory             : N/A
            Register File             : N/A
            L1 Cache                  : N/A
            L2 Cache                  : N/A
            Texture Memory            : N/A
            Texture Shared            : N/A
            CBU                       : N/A
            Total                     : N/A
    Aggregate
        Single Bit
            Device Memory             : N/A
            Register File             : N/A
            L1 Cache                  : N/A
            L2 Cache                  : N/A
            Texture Memory            : N/A
            Texture Shared            : N/A
            CBU                       : N/A
            Total                     : N/A
        Double Bit
            Device Memory             : N/A
            Register File             : N/A
            L1 Cache                  : N/A
            L2 Cache                  : N/A
            Texture Memory            : N/A
            Texture Shared            : N/A
            CBU                       : N/A
            Total                     : N/A
Retired Pages
    Single Bit ECC                    : N/A
    Double Bit ECC                    : N/A
    Pending Page Blacklist            : N/A
Remapped Rows                         : N/A
Temperature
    GPU Current Temp                  : 60 C
    GPU Shutdown Temp                 : 99 C
    GPU Slowdown Temp                 : 94 C
    GPU Max Operating Temp            : 90 C
    Memory Current Temp               : N/A
    Memory Max Operating Temp         : N/A
Power Readings
    Power Management                  : N/A
    Power Draw                        : N/A
    Power Limit                       : N/A
    Default Power Limit               : N/A
    Enforced Power Limit              : N/A
    Min Power Limit                   : N/A
    Max Power Limit                   : N/A
Clocks
    Graphics                          : 405 MHz
    SM                                : 405 MHz
    Memory                            : 405 MHz
    Video                             : 396 MHz
Applications Clocks
    Graphics                          : 1006 MHz
    Memory                            : 1001 MHz
Default Applications Clocks
    Graphics                          : 1004 MHz
    Memory                            : 1001 MHz
Max Clocks
    Graphics                          : 1241 MHz
    SM                                : 1241 MHz
    Memory                            : 1001 MHz
    Video                             : 1216 MHz
Max Customer Boost Clocks
    Graphics                          : N/A
Clock Policy
    Auto Boost                        : N/A
    Auto Boost Default                : N/A
Processes                             : None

If I try to run this on the CPU instead, the whole system freezes and I have to restart the machine manually. And even with a lower image resolution, smaller batch size, and so on, each epoch takes about 12 hours to finish on the CPU, which is completely impractical.

1 Answer:

Answer 0: (score 4)

I ran your model on Kaggle with batch_size = 48 and have attached a screenshot of the resource usage. One epoch took about 30-40 minutes to finish. I would say you can easily train the model within the 30+ GPU hours that Kaggle provides.

[screenshot: GPU resource usage while training on Kaggle]

I also tested inference with batch_size=1 and num_workers=0 in the data loader; GPU usage was 1.3 GB.

[screenshot: GPU memory usage during inference]

I suggest you train the model on Kaggle/Colab and download the weights to your local machine. Then you can run inference on your machine with batch size = 1. Inference is generally much faster anyway.
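A minimal sketch of that local inference setup, assuming the create_model, Build_dataset and transforms_val definitions from the question and the best_model_state.bin checkpoint saved during training (df_test is a placeholder for the actual test dataframe):

import torch
from torch.utils.data import DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = create_model(n_classes=8)  # same architecture as in training
model.load_state_dict(torch.load('best_model_state.bin', map_location=device))
model.eval()  # disable dropout / batch-norm updates

# batch_size=1 and num_workers=0 keep GPU memory low (~1.3 GB as measured above)
dataset_test = Build_dataset(df_test, 'test', 'test', transform=transforms_val)
test_loader = DataLoader(dataset_test, batch_size=1, num_workers=0)

with torch.no_grad():  # no autograd buffers during inference
    for data in test_loader:
        data = data.to(device)
        probs = torch.softmax(model(data), dim=1)  # per-class probabilities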