如何在 pytorch 上保存训练模型的权重?

时间:2021-02-21 20:42:07

标签: computer-vision pytorch yolo

我在 Colab 中使用 PyTorch 编写了 yolo 实现代码。我可以进行训练,但无法保存模型的权重。我不断收到“用尽输入”错误。有什么解决办法吗?

Code:
"""
Main file for training Yolo model on Pascal VOC dataset
"""

import torch
import torchvision.transforms as transforms
import torch.optim as optim
import torchvision.transforms.functional as FT #resim transformları için 
from tqdm import tqdm  #progressbar için 

from torch.utils.data import DataLoader
from model import Yolov1
from dataset import VOCDataset
from utils import (
    non_max_suppression,
    mean_average_precision,
    intersection_over_union,
    cellboxes_to_boxes,
    get_bboxes,
    plot_image,
    save_checkpoint,
    load_checkpoint,
)
from loss import YoloLoss

seed = 123
torch.manual_seed(seed) #same datasets loading

# Hyperparameters etc. 
LEARNING_RATE = 2e-5
DEVICE = "cuda" if torch.cuda.is_available else "cpu"
BATCH_SIZE = 16 
WEIGHT_DECAY = 0 #sadece bir batchte train yapmak için 0 dedik yani regularizasyon yapmıyoruz
EPOCHS = 1000
NUM_WORKERS = 2
PIN_MEMORY = True
LOAD_MODEL = True
LOAD_MODEL_FILE = "/content/drive/MyDrive/YOLO 5 adım/archive/best.pt" #over fit olan modeli aldık
IMG_DIR = "/content/drive/MyDrive/YOLO 5 adım/archive/images" 
LABEL_DIR = "/content/drive/MyDrive/YOLO 5 adım/archive/labels"


class Compose(object): #buradaki neden, göndereceğimiz dönüşümümüzün yalnızca resim üzerinde işlemesi çünkü sadece resize yapıyoruz
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, bboxes):
        for t in self.transforms:
            img, bboxes = t(img), bboxes

        return img, bboxes


transform = Compose([transforms.Resize((448, 448)), transforms.ToTensor(),]) #resize yapıp tensöre çevirdik


def train_fn(train_loader, model, optimizer, loss_fn): #tüm datasette bir training döngüsü olacak
    loop = tqdm(train_loader, leave=True) #progressbar videosunda varmış
    mean_loss = []

    for batch_idx, (x, y) in enumerate(loop):
        x, y = x.to(DEVICE), y.to(DEVICE)
        out = model(x)
        loss = loss_fn(out, y)
        mean_loss.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # update progress bar progress barda her batchteki lossu göreceğiz
        loop.set_postfix(loss=loss.item())

    print(f"Mean loss was {sum(mean_loss)/len(mean_loss)}")


def main():
    model = Yolov1(split_size=7, num_boxes=2, num_classes=20).to(DEVICE)
    optimizer = optim.Adam(
        model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY
    )
    loss_fn = YoloLoss()

    **if LOAD_MODEL:
        load_checkpoint(torch.load(LOAD_MODEL_FILE), model, optimizer)  ** I got the error from this line

    train_dataset = VOCDataset(
        "/content/drive/MyDrive/YOLO 5 adım/archive/8examples.csv", #eğer tüm dataseti istiyorsak train.csv yaparız biz burada 100 örnek aldık sadece
        transform=transform,
        img_dir=IMG_DIR,
        label_dir=LABEL_DIR,
    )

    test_dataset = VOCDataset( #test kısmı
        "/content/drive/MyDrive/YOLO 5 adım/archive/test.csv", transform=transform, img_dir=IMG_DIR, label_dir=LABEL_DIR,
    )

    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        shuffle=True,
        drop_last=False,#8 tanede denerken burayı false yaptık eğer çok az örnekte deniyorsak false fazla örnekte deniyorsak true yapıyoruz. çok az örnek yani tek bir batchte gerçeklesen örnekse false
    )

    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        shuffle=True,
        drop_last=True, 
    )

  
    for epoch in range(EPOCHS):
        for x, y in train_loader:  #BU KISIM GÖRSELLEŞTİRME İÇİN SONUCU BİZE GÖSTERİYOR TEK TEK RESİMLERDE
            x = x.to(DEVICE)              #Bu kısmı uygularken yukarıda hiperparametreler kısmındaki LOAD_MODAL'ı True yapmamız gerekiyor.
            for idx in range(8):
                bboxes = cellboxes_to_boxes(model(x))
                bboxes = non_max_suppression(bboxes[idx], iou_threshold=0.5, threshold=0.4, box_format="midpoint")
                plot_image(x[idx].permute(1,2,0).to("cpu"), bboxes)

            import sys
            sys.exit()

        pred_boxes, target_boxes = get_bboxes( train_loader, model, iou_threshold=0.5, threshold=0.4 )

        mean_avg_prec = mean_average_precision(
            pred_boxes, target_boxes, iou_threshold=0.5, box_format="midpoint"
        )
        print(f"Train mAP: {mean_avg_prec}") #her epoch için yazdırdık

        if mean_avg_prec > 0.9:
            checkpoint = {
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
            }
            save_checkpoint(checkpoint, filename=LOAD_MODEL_FILE)   #kaydetmek için buu
            import time
            time.sleep(10)

        train_fn(train_loader, model, optimizer, loss_fn)


if __name__ == "__main__":
    main()

https://colab.research.google.com/drive/1l6Z86Qk8qu7Oo7fV6YNaWmJRFB9NPjCv?usp=sharing enter image description here

我从这一行出错(在图片中)

#save_checkpoint function from utils
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

1 个答案:

答案 0 :(得分:0)

我看到你的代码是由 Youtuber "Aladdin Persson" 实现的

如果我看到 load_checkpoint ,就是这样。

def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])  

当您第一次训练模型时,您应该有 LOAD_MODEL = False &
一旦检查点以这个名称 "overfit.pth.tar" 保存,那么只有你可以加载它..
我已经测试了代码并且可以运行