Hi, I am trying to train a model that takes a sequence of images and classifies it. I wrote my own dataset class to load the data, but I am running into some confusing memory behavior. I read images from several folders, each holding one image sequence, and pad the sequences into batches. However, when I check memory with the nvidia-smi command, GPU memory grows to an unexpectedly high level.
To narrow the problem down, I commented out the image-loading part, so my dataset now just returns a torch.zeros tensor of size [num_frames, channel_num, height, width]. I noticed that if num_frames is a fixed number, memory does not increase. But if num_frames varies from batch to batch, GPU memory starts to grow: whenever a batch with a larger num_frames comes along, usage jumps and then stays at that level, and it never drops even when the following batches are smaller. So varying num_frames somehow causes the problem, but since each folder contains a different number of images, I need it to vary. Is this about the nvidia-smi command itself? Does it not show the actual memory usage? And if it is accurate, how can I fix this? The difference in memory usage between fixed and variable num_frames is really large. Below you can see my full code.
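As far as I understand (and I may be wrong), nvidia-smi shows everything the process has reserved from the driver, including whatever PyTorch's caching allocator is holding on to, so it might not reflect what my tensors actually occupy. Here is a minimal sketch of how I plan to compare the allocator's own numbers against nvidia-smi; the log_gpu_memory helper and the places I call it are just my own illustration, not part of the training code below.

def log_gpu_memory(tag=""):
    # Bytes currently occupied by live tensors.
    allocated = torch.cuda.memory_allocated()
    # Bytes held by PyTorch's caching allocator; this is closer to what
    # nvidia-smi reports and does not shrink when tensors are freed.
    reserved = torch.cuda.memory_reserved()
    print("{} allocated={:.1f} MiB reserved={:.1f} MiB".format(
        tag, allocated / 1024**2, reserved / 1024**2))

# I would call it right after optimizer.step() in the training loop, e.g.
#     log_gpu_memory("after step")
# and optionally call torch.cuda.empty_cache() to hand cached blocks back
# to the driver, just to see whether nvidia-smi then drops.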
from __future__ import print_function, division
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms,utils
import matplotlib.pyplot as plt
import time
import os
import copy
import torch.nn.functional as F
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from PIL import Image
from skimage import io, transform
from torch.utils.data import Dataset, DataLoader
import h5py
from torch.nn.utils.rnn import pad_sequence
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.image_conv = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=(1, 1)),
            nn.Conv2d(64, 64, 3, padding=(1, 1)),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Conv2d(64, 128, 3, padding=(1, 1)),
            nn.Conv2d(128, 128, 3, padding=(1, 1)),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Conv2d(128, 64, 3, padding=(1, 1)),
            nn.MaxPool2d(2),
            nn.ReLU()
        )
        self.fc_image = nn.Linear(8*8*64, 4)
    def forward(self, x):
        #print(x.shape)
        x = self.image_conv(x)
        x = self.fc_image(x.view((-1, 8*8*64)))
        return x
def get_label(name):  # converts names to int labels
    if "ANG" in name:
        return 0
    elif "HAP" in name:
        return 1
    elif "NEU" in name:
        return 2
    elif "SAD" in name:
        return 3
count = 0
class Audio_video_dataset(Dataset):
    def __init__(self, h5_file, video_root_dir, transform=None):
        self.h5_file = h5_file
        with h5py.File(self.h5_file, 'r') as f:
            self.keys = list(f.keys())
        self.video_root_dir = video_root_dir
        self.transform = transform
    def __len__(self):
        return len(self.keys)
    def read_images(self, video_dir):
        image_list = os.listdir(video_dir)
        """
        X = []
        for i in range(1, len(image_list)+1):
            image = Image.open(os.path.join(video_dir, 'image-{:d}.jpeg'.format(i)))
            if self.transform:
                image = self.transform(image)
            X.append(image)
        X = torch.stack(X, dim=0)
        """
        X = torch.zeros((len(image_list), 3, 64, 64))
        print(X.shape)
        return X
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        global count
        count += 1
        selected_elem = self.keys[idx]  # current element (audio, video)
        sequence_name = os.path.join(self.video_root_dir,
                                     selected_elem)
        video_dir = sequence_name + ".flv"
        label = get_label(sequence_name)
        image_seq = self.read_images(video_dir)
        return image_seq, label
def customBatchBuilder(samples):
    image_seq, label = zip(*samples)
    image_seq = pad_sequence(image_seq)
    image_seq = image_seq.view((-1, 3, 64, 64))
    label = torch.Tensor(label).int()
    return image_seq, label
dataset_train = Audio_video_dataset(h5_file="audio_features_4class.hdf5",
                                    video_root_dir='cropped_face_frames',
                                    transform=transforms.Compose([
                                        transforms.Resize((64, 64)),
                                        transforms.ToTensor()
                                    ]))
batch_size = 1
train_loader = DataLoader(dataset_train, batch_size=batch_size,
                          shuffle=True, num_workers=0, collate_fn=customBatchBuilder)
train_set_size = len(dataset_train)
#valid_set_size = len(dataset_valid)
print("Train set size:",train_set_size)
#print("Test set size:",valid_set_size)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # checks if there is gpu available
print(device)
def train_model(model, criterion, optimizer, num_epochs=25, checkp_epoch=0):
    since = time.time()
    pbar = tqdm(range(checkp_epoch, num_epochs))
    for epoch in pbar:  # range(checkp_epoch, num_epochs):
        model.train()  # Set model to training mode
        running_loss = 0.0
        running_corrects = 0
        for sample in train_loader:
            images, labels = sample
            images = images.to(device)
            batch_size = images.size(0)
            labels = torch.full((batch_size,), labels[0]).long().to(device)
            optimizer.zero_grad()
            with torch.set_grad_enabled(True):
                outputs = model(images)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
            # statistics
            running_loss += loss.item() * images.size(0)
            running_corrects += torch.sum(preds == labels.data)
        train_loss = running_loss / train_set_size
        train_acc = running_corrects.double() / train_set_size
        running_loss = 0.0
        running_corrects = 0
        pbar.set_description("train acc {:.3} loss {:.4}".format(train_acc, train_loss))
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # load best model weights
    return model
model_ft = Net().to(device)
criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.Adam(model_ft.parameters(), lr=1e-4)
num_epochs=100
model_ft = train_model(model_ft, criterion, optimizer_ft,num_epochs=num_epochs)