我正在开发一种全卷积自动编码器,该编码器将3个通道用作输入并输出2个通道(输入:LAB,输出:AB)。因为输出应与输入大小相同,所以我使用Full Convolution。
代码:
import torch.nn as nn
class AE(nn.Module):
    """Fully convolutional autoencoder mapping 3 input channels to 2 output channels.

    The encoder stacks five 5x5 convolutions (each followed by batch norm and
    ReLU), with a 2x2 max-pool after the first four.  The decoder mirrors it
    with five 5x5 transposed convolutions, preceded (from the second one on)
    by a 2x bilinear upsampling, and ends in a softmax.

    Every (transposed) convolution here uses ``kernel_size=5, stride=1,
    padding=1``.
    """

    def __init__(self):
        super().__init__()

        def conv_bn_relu(c_in, c_out):
            # 5x5 convolution -> batch norm -> ReLU.
            return [
                nn.Conv2d(c_in, c_out, kernel_size=5, stride=1, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(),
            ]

        def deconv(c_in, c_out):
            # 5x5 transposed convolution with the same stride/padding as the encoder convs.
            return nn.ConvTranspose2d(c_in, c_out, kernel_size=5, stride=1, padding=1)

        def pool():
            return nn.MaxPool2d(kernel_size=2, stride=2)

        def upsample():
            return nn.Upsample(scale_factor=2, mode='bilinear')

        # Layers are created strictly left to right, so the module indices
        # inside each nn.Sequential match a flat, hand-written listing.
        self.encoder = nn.Sequential(
            # conv 1
            *conv_bn_relu(3, 64), pool(),
            # conv 2
            *conv_bn_relu(64, 128), pool(),
            # conv 3
            *conv_bn_relu(128, 256), pool(),
            # conv 4
            *conv_bn_relu(256, 512), pool(),
            # conv 5 (bottleneck, no pooling)
            *conv_bn_relu(512, 1024),
        )

        self.decoder = nn.Sequential(
            # conv 6
            deconv(1024, 512), nn.BatchNorm2d(512), nn.ReLU(),
            # conv 7
            upsample(), deconv(512, 256), nn.BatchNorm2d(256), nn.ReLU(),
            # conv 8
            upsample(), deconv(256, 128), nn.BatchNorm2d(128), nn.ReLU(),
            # conv 9
            upsample(), deconv(128, 64), nn.BatchNorm2d(64), nn.ReLU(),
            # conv 10 out
            upsample(), deconv(64, 2),
            nn.Softmax()  # multi-class classification
            # TODO softmax deprecated
        )

    def forward(self, x):
        """Encode ``x`` and decode the result, returning the decoder output."""
        return self.decoder(self.encoder(x))
期望的输出张量大小为:torch.Size([1, 2, 199, 253])
实际的输出张量大小为:torch.Size([1, 2, 190, 238])
我的主要问题在于如何将Conv2d与MaxPool2d组合起来,并为ConvTranspose2d设置正确的参数值。因此我把它们分开处理:用Upsample来还原MaxPool2d,用ConvTranspose2d来还原Conv2d。但是输出仍然存在一些不对称,我真的不知道为什么。
谢谢您的帮助!
答案 0 :(得分:1)
有两个问题。
首先是填充不足:使用 kernel_size=5 时,每次卷积都会使图像的高和宽各缩小 4(每侧 2 个像素),因此所有卷积层都需要 padding=2,而不是 1。
第二个是“不均匀”的输入大小。也就是说,即使卷积已经正确填充,仍然还有下采样操作,它们每次都试图将图像分辨率减半。当尺寸不能被 2 整除时,它们只会返回较小的结果(整数除法会丢弃余数)。由于您的网络连续进行了 4 次 2 倍下采样,输入的 H、W 维度必须是 2^4=16 的倍数,这样输出才会与输入形状相同。示例如下:
import torch
import torch.nn as nn
class AE(nn.Module):
    """Fully convolutional autoencoder mapping 3 input channels to 2 output channels.

    All 5x5 (transposed) convolutions use ``padding=2`` with ``stride=1``, so
    they keep the spatial size unchanged.  The only layers that change
    resolution are the four 2x2 max-pools in the encoder and the four 2x
    bilinear upsamplings in the decoder.

    Because max-pooling floors odd sizes, the output has the same height and
    width as the input only when both are multiples of ``2**4 == 16``.
    """

    def __init__(self):
        super().__init__()
        self.encoder = nn.Sequential(
            # conv 1
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # conv 2
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # conv 3
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # conv 4
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # conv 5 (bottleneck, no pooling)
            nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(1024),
            nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            # conv 6
            nn.ConvTranspose2d(in_channels=1024, out_channels=512, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            # conv 7
            # align_corners=False is the library default; spelling it out
            # keeps the behavior and avoids the "default changed" warning.
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
            nn.ConvTranspose2d(in_channels=512, out_channels=256, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            # conv 8
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
            nn.ConvTranspose2d(in_channels=256, out_channels=128, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            # conv 9
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
            nn.ConvTranspose2d(in_channels=128, out_channels=64, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            # conv 10 out
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
            nn.ConvTranspose2d(in_channels=64, out_channels=2, kernel_size=5, stride=1, padding=2),
            # Softmax over the channel axis.  dim=1 is what the deprecated
            # implicit-dim form picks for 4-D input, so results are unchanged.
            # NOTE(review): this forces the 2 output channels to sum to 1 per
            # pixel -- confirm that is intended for the AB targets.
            nn.Softmax(dim=1),  # multi-class classification
        )

    def forward(self, x):
        """Run the encoder then the decoder.

        Args:
            x: 4-D tensor with 3 channels, i.e. ``(N, 3, H, W)``.

        Returns:
            Tensor of shape ``(N, 2, H, W)`` when ``H`` and ``W`` are
            multiples of 16; spatially smaller otherwise.
        """
        x = self.encoder(x)
        x = self.decoder(x)
        return x
# Demo: height and width are multiples of 16 (6*16 and 7*16), so the four
# 2x poolings divide evenly; print both shapes to compare spatial sizes.
# The tensor is named `sample` rather than `input` to avoid shadowing the builtin.
sample = torch.randn(1, 3, 6*16, 7*16)
output = AE()(sample)
print(sample.shape)
print(output.shape)