Face recognition model loss not decreasing

Time: 2020-03-05 21:59:12

Tags: python deep-learning conv-neural-network

I have written a script to train a Siamese-network-style model for face recognition on the LFW dataset, but the training loss does not decrease at all. There is probably a bug in my implementation; can you point it out? Currently, my code does the following:

  • Each epoch uses 500,000 triplets, all generated online from the data (because the number of possible triplets is far too large to enumerate).
  • Triplet sampling method: we keep a dictionary of {class_id: list of file paths with that class id}. From it we build the list of classes usable as the positive class (some classes have only 1 image, so they cannot serve as a positive class). On each iteration we randomly pick a positive class from that refined list and a negative class from the original list. We sample 2 images from the positive class (one as the anchor A, one as the positive P) and 1 image from the negative class (the negative N). A, P, N form our triplet.
  • The model is a ResNet whose final (512, 1000) softmax layer has been replaced with a (512, 128) dense layer (no activation). To avoid overfitting, only that last dense layer and layer4 are kept trainable; the rest of the network is frozen.
  • During training we find the semi-hard triplets in each batch (loss between 0 and the margin) and backprop using only those triplets (as described in the FaceNet paper); the selection rule is sketched right after this list.
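
For reference, the semi-hard selection rule can be written directly in terms of the anchor-positive and anchor-negative distances; a minimal sketch (the helper name and the example values are illustrative, not part of the script below):

import torch

def semi_hard_mask(d_ap, d_an, margin=0.5):
    # Semi-hard: the negative is farther away than the positive but still
    # inside the margin, i.e. d(a,p) < d(a,n) < d(a,p) + margin.
    # This is equivalent to 0 < d(a,p) - d(a,n) + margin < margin,
    # which is the loss-based filter used in the training loop below.
    return (d_an > d_ap) & (d_an < d_ap + margin)

# Example: three triplets (hard, semi-hard, easy)
d_ap = torch.tensor([0.2, 0.4, 0.9])
d_an = torch.tensor([0.1, 0.7, 2.0])
print(semi_hard_mask(d_ap, d_an))  # tensor([False,  True, False])

My full training script:
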
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import os, glob
import numpy as np
from PIL import Image

image_size = 224
batch_size = 512
margin = 0.5
learning_rate = 1e-3
num_epochs = 1000

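# Pretrained ResNet-18; replace its (512, 1000) classifier with a (512, 128) embedding layer (no activation)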
model = models.resnet18(pretrained=True)
model.fc = nn.Linear(model.fc.in_features, 128, bias=False)

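# Freeze everything except layer4 and the new embedding layer to limit overfitting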
for param in model.parameters():
  param.requires_grad = False
for param in model.fc.parameters():
  param.requires_grad = True
for param in model.layer4.parameters():
  param.requires_grad = True

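# Only the trainable parameters are passed to the optimizer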
optimizer = optim.Adam(params=list(model.fc.parameters())+list(model.layer4.parameters()), lr=learning_rate, weight_decay=0.05)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.DataParallel(model).to(device)
writer = SummaryWriter(log_dir="logs/")

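# Dataset that maps each class to its file paths and samples a random (A, P, N) triplet per call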
class TripletDataset(Dataset):
  def __init__(self, rootdir, transform):
    super().__init__()
    self.rootdir = rootdir
    self.classes = os.listdir(self.rootdir)
    self.file_paths = {c: glob.glob(os.path.join(rootdir, c, "*.jpg")) for c in self.classes}
    self.positive_classes = [c for c in self.classes if len(self.file_paths[c])>=2]
    self.transform = transform

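  # Pick a positive class (with at least 2 images) and a different negative class,
  # then draw A and P from the positive class and N from the negative class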
  def __getitem__(self, index=None):
    class_pos, class_neg = None, None
    while class_pos == class_neg:
      class_pos = np.random.choice(a=self.positive_classes, size=1)[0]
      class_neg = np.random.choice(a=self.classes, size=1)[0]

    fp_a, fp_p = np.random.choice(a=self.file_paths[class_pos], size=2, replace=False)
    fp_n = np.random.choice(a=self.file_paths[class_neg], size=1)[0]

    return {
        "fp_a": fp_a,
        "fp_p": fp_p,
        "fp_n": fp_n,
        "A": self.transform(Image.open(fp_a)),
        "P": self.transform(Image.open(fp_p)),
        "N": self.transform(Image.open(fp_n)),
    }

  def __len__(self):
    return 500000


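# Triplet loss: max(d(A,P) - d(A,N) + margin, 0), using per-row Euclidean distances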
def triplet_loss(a, p, n, margin=margin):
    d_ap = (a-p).norm(p='fro', dim=1)
    d_an = (a-n).norm(p='fro', dim=1)
    loss = torch.clamp(d_ap-d_an+margin, min=0)
    return loss, d_ap.mean(), d_an.mean()

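# Augmentation plus normalization with dataset-specific channel statistics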
transform = transforms.Compose([
        transforms.RandomResizedCrop(image_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.596, 0.436, 0.586], [0.2066, 0.240, 0.186])
        ])
train_dataset = TripletDataset("lfw", transform)
nw = 4 if torch.cuda.is_available() else 0
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=nw, shuffle=True)

num_batches = len(train_dataloader)
model.train()
running_loss = 0

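# Training loop: embed A, P, N, keep only the semi-hard triplets, and backprop their mean loss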
for epoch in range(num_epochs):
    for batch_id, dictionary in enumerate(train_dataloader):
        a, p, n = dictionary["A"], dictionary["P"], dictionary["N"]
        a, p, n = a.to(device), p.to(device), n.to(device)
        emb_a, emb_p, emb_n = model(a), model(p), model(n)
        losses, d_ap, d_an = triplet_loss(a=emb_a, p=emb_p, n=emb_n)

        semi_hard_triplets = torch.where((losses>0) & (losses<margin))
        losses = losses[semi_hard_triplets]
        loss = losses.mean()
        loss.backward()
        optimizer.step()  
        running_loss += loss.item()

        print("Epoch {} Batch {}/{} Loss = {} Avg AP dist = {} Avg AN dist = {}".format(epoch, batch_id, num_batches, loss.item(), d_ap.item(), d_an.item()), flush=True)
        writer.add_scalar("Loss/Train", loss.item(), epoch*num_batches+batch_id)
        writer.add_scalars("AP_AN_Distances", {"AP": d_ap.item(), "AN": d_an.item()}, epoch*num_batches+batch_id)

    print("Epoch {} Avg Loss {}".format(epoch, running_loss/num_batches), flush=True)
    writer.add_scalar("Epoch_Loss", running_loss/num_batches, epoch)
    torch.save(model.state_dict(), "facenet_epoch_{}.pth".format(epoch))

Loss plot: https://tensorboard.dev/experiment/8TgzPTjuRCOFkFV5lr5etQ/ Please let me know if you need any other information to help.

0 Answers

There are no answers yet.