Question

我正在开发一个全卷积自动编码器，它接收3个通道的输入并输出2个通道（输入：LAB，输出：AB）。由于输出的空间尺寸应与输入相同，我采用了全卷积（fully convolutional）结构。

代码：

import torch.nn as nn


# Autoencoder as posted in the question: a 3-channel input is encoded by five
# convolution stages and decoded by five transposed-convolution stages into a
# 2-channel output.
#
# Size arithmetic for the settings used below (kernel_size=5, stride=1,
# padding=1):
#   * each Conv2d shrinks H and W by 2            (H + 2*1 - 5 + 1 = H - 2)
#   * each ConvTranspose2d grows H and W by 2     ((H - 1) - 2*1 + 5 = H + 2)
#   * each MaxPool2d(2, 2) halves H and W, rounding odd sizes down
#   * each Upsample(scale_factor=2) doubles H and W
# The shrinking and growing steps sit on opposite sides of the pooling and
# upsampling steps, so they do not cancel out and the output size differs
# from the input size.
class AE(nn.Module):
   def __init__(self):
       super(AE, self).__init__()

       # NOTE(review): the next line is indented one column deeper than the
       # statements around it, which Python rejects as an unexpected indent.
        self.encoder = nn.Sequential(
           # conv 1: 3 -> 64 channels, then 2x downsampling
           nn.Conv2d(in_channels=3, out_channels=64, kernel_size=5, stride=1, padding=1),
           nn.BatchNorm2d(64),
           nn.ReLU(),
           nn.MaxPool2d(kernel_size=2, stride=2),

           # conv 2: 64 -> 128 channels, then 2x downsampling
           nn.Conv2d(in_channels=64, out_channels=128, kernel_size=5, stride=1, padding=1),
           nn.BatchNorm2d(128),
           nn.ReLU(),
           nn.MaxPool2d(kernel_size=2, stride=2),

           # conv 3: 128 -> 256 channels, then 2x downsampling
           nn.Conv2d(in_channels=128, out_channels=256, kernel_size=5, stride=1, padding=1),
           nn.BatchNorm2d(256),
           nn.ReLU(),
           nn.MaxPool2d(kernel_size=2, stride=2),

           # conv 4: 256 -> 512 channels, then 2x downsampling
           nn.Conv2d(in_channels=256, out_channels=512, kernel_size=5, stride=1, padding=1),
           nn.BatchNorm2d(512),
           nn.ReLU(),
           nn.MaxPool2d(kernel_size=2, stride=2),

           # conv 5: 512 -> 1024 channels (bottleneck, no pooling)
           nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=5, stride=1, padding=1),
           nn.BatchNorm2d(1024),
           nn.ReLU()

       )

       self.decoder = nn.Sequential(
           # conv 6: 1024 -> 512 channels (mirrors conv 5, no upsampling)
           nn.ConvTranspose2d(in_channels=1024, out_channels=512, kernel_size=5, stride=1, padding=1),
           nn.BatchNorm2d(512),
           nn.ReLU(),

           # conv 7: 2x upsampling, then 512 -> 256 channels
           nn.Upsample(scale_factor=2, mode='bilinear'),
           nn.ConvTranspose2d(in_channels=512, out_channels=256, kernel_size=5, stride=1, padding=1),
           nn.BatchNorm2d(256),
           nn.ReLU(),

           # conv 8: 2x upsampling, then 256 -> 128 channels
           nn.Upsample(scale_factor=2, mode='bilinear'),
           nn.ConvTranspose2d(in_channels=256, out_channels=128, kernel_size=5, stride=1, padding=1),
           nn.BatchNorm2d(128),
           nn.ReLU(),

           # conv 9: 2x upsampling, then 128 -> 64 channels
           nn.Upsample(scale_factor=2, mode='bilinear'),
           nn.ConvTranspose2d(in_channels=128, out_channels=64, kernel_size=5, stride=1, padding=1),
           nn.BatchNorm2d(64),
           nn.ReLU(),

           # conv 10 out: 2x upsampling, then 64 -> 2 output channels
           nn.Upsample(scale_factor=2, mode='bilinear'),
           nn.ConvTranspose2d(in_channels=64, out_channels=2, kernel_size=5, stride=1, padding=1),
           nn.Softmax()    # multi-class classification

           # TODO softmax deprecated (nn.Softmax() is constructed without an explicit `dim`)
       )

   def forward(self, x):
       # Run the encoder and then the decoder; the result has 2 channels.
       x = self.encoder(x)
       x = self.decoder(x)
       return x

期望的输出张量大小为：torch.Size([1, 2, 199, 253])

实际得到的输出张量大小为：torch.Size([1, 2, 190, 238])

我的主要问题在于如何把Conv2d和MaxPool2d组合起来，并为ConvTranspose2d设置正确的参数值。因此我把它们分开处理：只用Upsample来还原MaxPool2d，只用ConvTranspose2d来还原Conv2d。但输出尺寸仍然有些不对称，我实在不知道原因。

谢谢您的帮助！

Answer 1

有两个问题。

首先是填充不足：使用kernel_size=5时，每次卷积都会使图像在每个维度上缩小4个像素（每侧2个像素），因此所有卷积层都需要padding=2，而不只是1。

第二个问题是输入尺寸“不整除”。也就是说，即使卷积已经正确填充，还剩下下采样操作，每次下采样都试图将图像分辨率减半。当尺寸不能被2整除时，得到的结果会偏小（整数除法会丢弃余数）。由于您的网络连续进行了4次2倍下采样，输入的H、W维度必须是2^4=16的倍数，这样才能得到与输入形状相同的输出。示例如下：

import torch
import torch.nn as nn

class AE(nn.Module):
    """Fully convolutional autoencoder mapping 3 input channels to 2 output channels.

    Every convolution uses ``kernel_size=5, stride=1, padding=2``, so it keeps
    the spatial size unchanged. Only the four ``MaxPool2d`` layers (each halves
    H and W, rounding down) and the four ``Upsample`` layers (each doubles H
    and W) change the resolution. The output therefore has the same H and W as
    the input whenever both are multiples of ``2 ** 4 = 16``; for other sizes
    the rounding in the pooling layers makes the output smaller.

    The final layer applies a softmax across the channel dimension, so the
    two output channels sum to 1 at every pixel.
    """

    def __init__(self):
        super().__init__()

        self.encoder = nn.Sequential(
            # conv 1: 3 -> 64 channels, then 2x downsampling
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # conv 2: 64 -> 128 channels, then 2x downsampling
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # conv 3: 128 -> 256 channels, then 2x downsampling
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # conv 4: 256 -> 512 channels, then 2x downsampling
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # conv 5: 512 -> 1024 channels (bottleneck, no pooling)
            nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(1024),
            nn.ReLU(),
        )

        self.decoder = nn.Sequential(
            # conv 6: 1024 -> 512 channels (mirrors conv 5, no upsampling)
            nn.ConvTranspose2d(in_channels=1024, out_channels=512, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(512),
            nn.ReLU(),

            # conv 7: 2x upsampling, then 512 -> 256 channels
            # align_corners=False is the library default for bilinear mode;
            # it is spelled out so the interpolation behaviour is explicit.
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
            nn.ConvTranspose2d(in_channels=512, out_channels=256, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(256),
            nn.ReLU(),

            # conv 8: 2x upsampling, then 256 -> 128 channels
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
            nn.ConvTranspose2d(in_channels=256, out_channels=128, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(128),
            nn.ReLU(),

            # conv 9: 2x upsampling, then 128 -> 64 channels
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
            nn.ConvTranspose2d(in_channels=128, out_channels=64, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),

            # conv 10 out: 2x upsampling, then 64 -> 2 output channels
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
            nn.ConvTranspose2d(in_channels=64, out_channels=2, kernel_size=5, stride=1, padding=2),
            # Normalise across channels (dim=1 of an N x C x H x W tensor).
            # Omitting `dim` relies on a deprecated implicit choice and emits
            # a warning on every forward pass.
            nn.Softmax(dim=1),    # multi-class classification
        )

    def forward(self, x):
        """Encode and decode ``x``.

        Args:
            x: tensor of shape ``(N, 3, H, W)``.

        Returns:
            Tensor with 2 channels; its spatial size equals ``(H, W)`` when
            both are multiples of 16.
        """
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# Shape check: both spatial sizes (6*16 and 7*16) are multiples of 16, so the
# four 2x poolings in the encoder divide evenly and the decoder's four 2x
# upsamplings restore the original height and width.
sample = torch.randn(1, 3, 6 * 16, 7 * 16)  # named `sample` to avoid shadowing the builtin `input`
with torch.no_grad():  # only the shapes are inspected; no gradients needed
    reconstruction = AE()(sample)
print(sample.shape)
print(reconstruction.shape)

为什么我的全卷积自动编码器不对称？

1 个答案: