I am currently working on a computer vision project and I keep getting a runtime error saying "CUDA out of memory". I have tried everything I can think of: reducing the batch size and the image resolution, clearing the cache, deleting variables after training starts, reducing the amount of image data, and so on. Unfortunately, the error persists. My HP Pavilion laptop has an Nvidia GeForce 940MX graphics card, and I installed CUDA 10.2 and cuDNN following the PyTorch installation page. My goal is to build a Flask website around this model, but this issue is holding me back. Any suggestions would be appreciated.
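For reference, this is roughly the cache-clearing helper I call between batches (a minimal sketch; the helper name is my own, but gc.collect() and torch.cuda.empty_cache() are the standard calls):

import gc
import torch

def release_gpu_memory():
    # drop unreachable Python objects first, then return PyTorch's cached
    # CUDA blocks to the driver so the freed memory is actually reusable
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# called after each training batch, e.g.
#   del inputs, labels
#   release_gpu_memory()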
Here is my code:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import os
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import albumentations as A
from torch.utils.data import TensorDataset, DataLoader,Dataset
from torchvision import models
from collections import defaultdict
from torch.utils.data.sampler import RandomSampler
import torch.optim as optim
from torch.optim import lr_scheduler
from sklearn import model_selection
from tqdm import tqdm
import gc
# generate data from csv file
class Build_dataset(Dataset):
    def __init__(self, csv, split, mode, transform=None):
        self.csv = csv.reset_index(drop=True)
        self.split = split
        self.mode = mode
        self.transform = transform

    def __len__(self):
        return self.csv.shape[0]

    def __getitem__(self, index):
        row = self.csv.iloc[index]
        image = cv2.imread(row.filepath)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform is not None:
            res = self.transform(image=image)
            image = res['image'].astype(np.float32)
        else:
            image = image.astype(np.float32)
        image = image.transpose(2, 0, 1)
        data = torch.tensor(image).float()
        if self.mode == 'test':
            return data
        else:
            return data, torch.tensor(self.csv.iloc[index].target).long()
# training epoch
def train_epoch(model, loader, optimizer, loss_fn, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for inputs, labels in tqdm(loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # delete the batch tensors and force garbage collection
        del inputs, labels
        gc.collect()
    return correct_predictions.double() / n_examples, np.mean(losses)
# validation epoch
def val_epoch(model, loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for inputs, labels in tqdm(loader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())
            # delete the batch tensors and force garbage collection
            del inputs, labels
            gc.collect()
    return correct_predictions.double() / n_examples, np.mean(losses)
def train(model, device, num_epochs):
    # generate data
    dataset_train = Build_dataset(df_train, 'train', 'train', transform=transforms_train)
    dataset_valid = Build_dataset(df_valid, 'train', 'val', transform=transforms_val)
    # load data
    train_loader = DataLoader(dataset_train, batch_size=16, sampler=RandomSampler(dataset_train), num_workers=4)
    valid_loader = DataLoader(dataset_valid, batch_size=16, shuffle=True, num_workers=4)
    dataset_train_size = len(dataset_train)
    dataset_valid_size = len(dataset_valid)
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=3e-5)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, threshold=0.001, mode='max')
    loss_fn = nn.CrossEntropyLoss().to(device)
    history = defaultdict(list)
    best_accuracy = 0.0
    for epoch in range(num_epochs):
        print(f'Epoch {epoch + 1} / {num_epochs}')
        print('-' * 30)
        train_acc, train_loss = train_epoch(model, train_loader, optimizer, loss_fn, device, scheduler, dataset_train_size)
        print(f'Train loss {train_loss} accuracy {train_acc}')
        valid_acc, valid_loss = val_epoch(model, valid_loader, loss_fn, device, dataset_valid_size)
        print(f'Val loss {valid_loss} accuracy {valid_acc}')
        print()
        # step the plateau scheduler on validation accuracy (mode='max')
        scheduler.step(valid_acc)
        history['train_acc'].append(train_acc)
        history['train_loss'].append(train_loss)
        history['val_acc'].append(valid_acc)
        history['val_loss'].append(valid_loss)
        if valid_acc > best_accuracy:
            torch.save(model.state_dict(), 'best_model_state.bin')
            best_accuracy = valid_acc
    print(f'Best Accuracy: {best_accuracy}')
    model.load_state_dict(torch.load('best_model_state.bin'))
    return model, history
if __name__ == '__main__':
    # competition data - 2020
    data_dir = "C:\\Users\\Aniruddh\\Documents\\kaggle\\jpeg_melanoma_2020"
    # competition data - 2019
    data_dir2 = "C:\\Users\\Aniruddh\\Documents\\kaggle\\jpeg_melanoma_2019"
    # device
    device = torch.device("cuda")
    # augmenting images
    image_size = 384
    transforms_train = A.Compose([
        A.Transpose(p=0.5),
        A.VerticalFlip(p=0.5),
        A.HorizontalFlip(p=0.5),
        A.RandomBrightness(limit=0.2, p=0.75),
        A.RandomContrast(limit=0.2, p=0.75),
        A.OneOf([
            A.MedianBlur(blur_limit=5),
            A.GaussianBlur(blur_limit=5),
            A.GaussNoise(var_limit=(5.0, 30.0)),
        ], p=0.7),
        A.OneOf([
            A.OpticalDistortion(distort_limit=1.0),
            A.GridDistortion(num_steps=5, distort_limit=1.),
            A.ElasticTransform(alpha=3),
        ], p=0.7),
        A.CLAHE(clip_limit=4.0, p=0.7),
        A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=20, val_shift_limit=10, p=0.5),
        A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=15, border_mode=0, p=0.85),
        A.Resize(image_size, image_size),
        A.Cutout(max_h_size=int(image_size * 0.375), max_w_size=int(image_size * 0.375), num_holes=1, p=0.7),
        A.Normalize()
    ])
    transforms_val = A.Compose([
        A.Resize(image_size, image_size),
        A.Normalize()
    ])
    # create data
    df_train = pd.read_csv("C:\\Users\\Aniruddh\\Documents\\kaggle\\jpeg_melanoma_2020\\train.csv")  # /kaggle/input/siim-isic-melanoma-classification/train.csv
    df_train.head()
    df_train['is_ext'] = 0
    df_train['filepath'] = df_train['image_name'].apply(lambda x: os.path.join(data_dir, 'train', f'{x}.jpg'))
    # dataset from 2020 data
    df_train['diagnosis'] = df_train['diagnosis'].apply(lambda x: x.replace('seborrheic keratosis', 'BKL'))
    df_train['diagnosis'] = df_train['diagnosis'].apply(lambda x: x.replace('lichenoid keratosis', 'BKL'))
    df_train['diagnosis'] = df_train['diagnosis'].apply(lambda x: x.replace('solar lentigo', 'BKL'))
    df_train['diagnosis'] = df_train['diagnosis'].apply(lambda x: x.replace('lentigo NOS', 'BKL'))
    df_train['diagnosis'] = df_train['diagnosis'].apply(lambda x: x.replace('cafe-au-lait macule', 'unknown'))
    df_train['diagnosis'] = df_train['diagnosis'].apply(lambda x: x.replace('atypical melanocytic proliferation', 'unknown'))
    # dataset from 2019 data
    df_train2 = pd.read_csv('/content/drive/My Drive/siim_melanoma images/train_2019.csv')
    df_train2 = df_train2[df_train2['tfrecord'] >= 0].reset_index(drop=True)
    #df_train2['fold'] = df_train2['tfrecord'] % 5
    df_train2['is_ext'] = 1
    df_train2['filepath'] = df_train2['image_name'].apply(lambda x: os.path.join(data_dir2, 'train', f'{x}.jpg'))
    df_train2['diagnosis'] = df_train2['diagnosis'].apply(lambda x: x.replace('NV', 'nevus'))
    df_train2['diagnosis'] = df_train2['diagnosis'].apply(lambda x: x.replace('MEL', 'melanoma'))
    # concat both 2019 and 2020 data
    df_train = pd.concat([df_train, df_train2]).reset_index(drop=True)
    # shuffle data
    df = df_train.sample(frac=1).reset_index(drop=True)
    # creating 8 different target values
    new_target = {d: idx for idx, d in enumerate(sorted(df.diagnosis.unique()))}
    df['target'] = df['diagnosis'].map(new_target)
    mel_idx = new_target['melanoma']
    df = df[['filepath', 'diagnosis', 'target', 'is_ext']]
    class_names = list(df['diagnosis'].unique())
    # splitting train and validation data (roughly 20% validation)
    df_valid = df[:11471]
    df_train = df[11471:].reset_index(drop=True)

    # create model
    def create_model(n_classes):
        model = models.resnet50(pretrained=True)
        n_features = model.fc.in_features
        model.fc = nn.Linear(n_features, n_classes)
        return model.to(device)

    # model
    base_model = create_model(len(class_names))
    # train model
    base_model, history = train(base_model, device, num_epochs=15)
Goal of the code
The goal of the project is to classify skin-cancer images by creating 8 different target classes from the given dataset (the competition itself is about classifying benign vs. malignant images, but I use the diagnosis column of the dataset as the target variable because the data is quite skewed). The model is ResNet-50 from torchvision.models.
These are the datasets used:
Skin images (this year's competition): https://www.kaggle.com/cdeotte/jpeg-melanoma-384x384
Skin images (last year's competition): https://www.kaggle.com/cdeotte/jpeg-isic2019-384x384
I decided to build a Flask application on top of this model, but CUDA memory keeps causing a runtime error:
RuntimeError: CUDA out of memory. Tried to allocate 144.00 MiB (GPU 0; 2.00 GiB total capacity; 1.21 GiB already allocated; 43.55 MiB free; 1.23 GiB reserved in total by PyTorch)
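To narrow it down, I also print PyTorch's memory counters right before the forward pass (a minimal sketch; the helper name is my own, but torch.cuda.memory_allocated() and torch.cuda.memory_reserved() are the standard PyTorch APIs):

import torch

def log_cuda_memory(tag=""):
    # "allocated" is what tensors actually occupy; "reserved" matches the
    # "reserved in total by PyTorch" figure reported in the error message
    alloc = torch.cuda.memory_allocated() / 1024 ** 2
    reserved = torch.cuda.memory_reserved() / 1024 ** 2
    print(f"{tag} allocated: {alloc:.1f} MiB, reserved: {reserved:.1f} MiB")

# e.g. log_cuda_memory("before forward") just before outputs = model(inputs)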
These are the details of my Nvidia GPU:
Sun Sep 13 19:09:34 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 451.67 Driver Version: 451.67 CUDA Version: 11.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce 940MX WDDM | 00000000:01:00.0 Off | N/A |
| N/A 63C P8 N/A / N/A | 37MiB / 2048MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
# more information about my GPU
==============NVSMI LOG==============
Timestamp : Sun Sep 13 19:11:22 2020
Driver Version : 451.67
CUDA Version : 11.0
Attached GPUs : 1
GPU 00000000:01:00.0
Product Name : GeForce 940MX
Product Brand : GeForce
Display Mode : Disabled
Display Active : Disabled
Persistence Mode : N/A
Accounting Mode : Disabled
Accounting Mode Buffer Size : 4000
Driver Model
Current : WDDM
Pending : WDDM
Serial Number : N/A
GPU UUID : GPU-9a8c69df-26f2-2a98-3712-ea22f6add038
Minor Number : N/A
VBIOS Version : 82.08.6D.00.8C
MultiGPU Board : No
Board ID : 0x100
GPU Part Number : N/A
Inforom Version
Image Version : N/A
OEM Object : N/A
ECC Object : N/A
Power Management Object : N/A
GPU Operation Mode
Current : N/A
Pending : N/A
GPU Virtualization Mode
Virtualization Mode : None
Host VGPU Mode : N/A
IBMNPU
Relaxed Ordering Mode : N/A
PCI
Bus : 0x01
Device : 0x00
Domain : 0x0000
Device Id : 0x134D10DE
Bus Id : 00000000:01:00.0
Sub System Id : 0x83F9103C
GPU Link Info
PCIe Generation
Max : 3
Current : 1
Link Width
Max : 4x
Current : 4x
Bridge Chip
Type : N/A
Firmware : N/A
Replays Since Reset : 0
Replay Number Rollovers : 0
Tx Throughput : 0 KB/s
Rx Throughput : 0 KB/s
Fan Speed : N/A
Performance State : P8
Clocks Throttle Reasons
Idle : Not Active
Applications Clocks Setting : Not Active
SW Power Cap : Not Active
HW Slowdown : Not Active
HW Thermal Slowdown : N/A
HW Power Brake Slowdown : N/A
Sync Boost : Not Active
SW Thermal Slowdown : Not Active
Display Clock Setting : Not Active
FB Memory Usage
Total : 2048 MiB
Used : 37 MiB
Free : 2011 MiB
BAR1 Memory Usage
Total : 256 MiB
Used : 225 MiB
Free : 31 MiB
Compute Mode : Default
Utilization
Gpu : 0 %
Memory : 0 %
Encoder : N/A
Decoder : N/A
Encoder Stats
Active Sessions : 0
Average FPS : 0
Average Latency : 0
FBC Stats
Active Sessions : 0
Average FPS : 0
Average Latency : 0
Ecc Mode
Current : N/A
Pending : N/A
ECC Errors
Volatile
Single Bit
Device Memory : N/A
Register File : N/A
L1 Cache : N/A
L2 Cache : N/A
Texture Memory : N/A
Texture Shared : N/A
CBU : N/A
Total : N/A
Double Bit
Device Memory : N/A
Register File : N/A
L1 Cache : N/A
L2 Cache : N/A
Texture Memory : N/A
Texture Shared : N/A
CBU : N/A
Total : N/A
Aggregate
Single Bit
Device Memory : N/A
Register File : N/A
L1 Cache : N/A
L2 Cache : N/A
Texture Memory : N/A
Texture Shared : N/A
CBU : N/A
Total : N/A
Double Bit
Device Memory : N/A
Register File : N/A
L1 Cache : N/A
L2 Cache : N/A
Texture Memory : N/A
Texture Shared : N/A
CBU : N/A
Total : N/A
Retired Pages
Single Bit ECC : N/A
Double Bit ECC : N/A
Pending Page Blacklist : N/A
Remapped Rows : N/A
Temperature
GPU Current Temp : 60 C
GPU Shutdown Temp : 99 C
GPU Slowdown Temp : 94 C
GPU Max Operating Temp : 90 C
Memory Current Temp : N/A
Memory Max Operating Temp : N/A
Power Readings
Power Management : N/A
Power Draw : N/A
Power Limit : N/A
Default Power Limit : N/A
Enforced Power Limit : N/A
Min Power Limit : N/A
Max Power Limit : N/A
Clocks
Graphics : 405 MHz
SM : 405 MHz
Memory : 405 MHz
Video : 396 MHz
Applications Clocks
Graphics : 1006 MHz
Memory : 1001 MHz
Default Applications Clocks
Graphics : 1004 MHz
Memory : 1001 MHz
Max Clocks
Graphics : 1241 MHz
SM : 1241 MHz
Memory : 1001 MHz
Video : 1216 MHz
Max Customer Boost Clocks
Graphics : N/A
Clock Policy
Auto Boost : N/A
Auto Boost Default : N/A
Processes : None
If I try to run this on the CPU instead, the whole system freezes and I have to restart the computer manually. And even with a lower image resolution, a smaller batch size, and so on, each epoch takes about 12 hours to complete on the CPU, which is completely impractical.
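One thing I am considering, in case it is relevant, is mixed precision with gradient accumulation, so each forward/backward pass uses a small batch while the optimizer still steps on a larger effective batch (a minimal sketch, assuming a PyTorch version with torch.cuda.amp, i.e. 1.6+; the function name and accum_steps value are my own, the loop mirrors my train_epoch above):

import torch
from torch.cuda.amp import autocast, GradScaler

def train_epoch_amp(model, loader, optimizer, loss_fn, device, accum_steps=4):
    # half-precision forward/backward plus gradient accumulation:
    # memory per step is driven by the small loader batch, while the
    # optimizer updates as if the batch were accum_steps times larger
    scaler = GradScaler()
    model.train()
    optimizer.zero_grad()
    for step, (inputs, labels) in enumerate(loader):
        inputs, labels = inputs.to(device), labels.to(device)
        with autocast():
            outputs = model(inputs)
            loss = loss_fn(outputs, labels) / accum_steps
        scaler.scale(loss).backward()
        if (step + 1) % accum_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

Would something like this realistically fit the model into 2 GiB, or is the 940MX simply too small for a 384x384 ResNet-50?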