我正在尝试实现CNN,以便在PyTorch中对图像进行回归。我已经在Keras中实现了一个工作模型,我想在PyTorch中进行翻译,但是我面临许多问题。
本质上,在Keras中,模型收敛,而在PyTorch中,模型不收敛。在PyTorch中,我始终会有恒定的训练损失,并且训练过的模型对于任何图像总是输出相同的值。我像在工作的Keras模型中一样初始化了所有层,在Keras中添加了l2正则化,我还实现了相同的学习率衰减。一切看起来完全一样,但是在PyTorch中,我的模型无法收敛。
是否有可能以相同的方式初始化所有模型,而在Keras中收敛时,模型无法在PyTorch中收敛?如果是这样,您会建议我在特定情况下做什么(一个纪元后持续不断的训练损失和不断的预测)?
我已经尝试修剪梯度并更改学习率,一开始我使用lr = 0.001的Adam,然后尝试使用0.1和0.0001,始终模拟Keras基于时间的衰减。
任务包括从this Udacity数据集中回归转向角。
预处理: -以[0,1]间隔获取图像的灰度版本,并按照以下步骤将其裁剪为200x200大小:
half_the_width = int(img.shape[1] / 2)
img = img[img.shape[0] - crop_heigth: img.shape[0], half_the_width - int(crop_width / 2):
half_the_width + int(crop_width / 2)]
因此,我输入的图像尺寸为200x200,目标尺寸为1(转向角)。
损失是MSEloss
这是我要模拟的 KERAS 模型,它使用学习率衰减为1e-5的Adam,取自DroNet repo:
def resnet8(img_width, img_height, img_channels, output_dim):
"""
Define model architecture.
# Arguments
img_width: Target image widht.
img_height: Target image height.
img_channels: Target image channels.
output_dim: Dimension of model output.
# Returns
model: A Model instance.
"""
# Input
img_input = Input(shape=(img_height, img_width, img_channels))
x1 = Conv2D(32, (5, 5), strides=[2,2], padding='same')(img_input)
x1 = MaxPooling2D(pool_size=(3, 3), strides=[2,2])(x1)
# First residual block
x2 = keras.layers.normalization.BatchNormalization()(x1)
x2 = Activation('relu')(x2)
x2 = Conv2D(32, (3, 3), strides=[2,2], padding='same',
kernel_initializer="he_normal",
kernel_regularizer=regularizers.l2(1e-4))(x2)
x2 = keras.layers.normalization.BatchNormalization()(x2)
x2 = Activation('relu')(x2)
x2 = Conv2D(32, (3, 3), padding='same',
kernel_initializer="he_normal",
kernel_regularizer=regularizers.l2(1e-4))(x2)
x1 = Conv2D(32, (1, 1), strides=[2,2], padding='same')(x1)
x3 = add([x1, x2])
# Second residual block
x4 = keras.layers.normalization.BatchNormalization()(x3)
x4 = Activation('relu')(x4)
x4 = Conv2D(64, (3, 3), strides=[2,2], padding='same',
kernel_initializer="he_normal",
kernel_regularizer=regularizers.l2(1e-4))(x4)
x4 = keras.layers.normalization.BatchNormalization()(x4)
x4 = Activation('relu')(x4)
x4 = Conv2D(64, (3, 3), padding='same',
kernel_initializer="he_normal",
kernel_regularizer=regularizers.l2(1e-4))(x4)
x3 = Conv2D(64, (1, 1), strides=[2,2], padding='same')(x3)
x5 = add([x3, x4])
# Third residual block
x6 = keras.layers.normalization.BatchNormalization()(x5)
x6 = Activation('relu')(x6)
x6 = Conv2D(128, (3, 3), strides=[2,2], padding='same',
kernel_initializer="he_normal",
kernel_regularizer=regularizers.l2(1e-4))(x6)
x6 = keras.layers.normalization.BatchNormalization()(x6)
x6 = Activation('relu')(x6)
x6 = Conv2D(128, (3, 3), padding='same',
kernel_initializer="he_normal",
kernel_regularizer=regularizers.l2(1e-4))(x6)
x5 = Conv2D(128, (1, 1), strides=[2,2], padding='same')(x5)
x7 = add([x5, x6])
x = Flatten()(x7)
x = Activation('relu')(x)
x = Dropout(0.5)(x)
# Steering channel
steer = Dense(output_dim)(x)
# Collision channel
coll = Dense(output_dim)(x)
coll = Activation('sigmoid')(coll)
# Define steering-collision model
model = Model(inputs=[img_input], outputs=[steer, coll])
print(model.summary())
return model
在 PyTorch 中,我试图仅实现转向角预测,该论文提到转向角和碰撞预测是不相关的。
这是我的PyTorch实现: 模型
def init_kernel(m):
if isinstance(m, nn.Conv2d):
# Initialize kernels of Conv2d layers as kaiming normal
nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
# Initialize biases of Conv2d layers at 0
nn.init.zeros_(m.bias)
def __init__(self, img_channels, in_height, in_width, output_dim):
super(resnet8, self).__init__()
self.layer1 = nn.Sequential(
nn.Conv2d(in_channels=img_channels,out_channels=32,
kernel_size=[5,5], stride=[2,2], padding=[5//2,5//2]),
nn.MaxPool2d(kernel_size=[3,3], stride=[2,2]))
self.residual_block_1a = nn.Sequential(
nn.BatchNorm2d(32),
nn.ReLU(),
nn.Conv2d(in_channels=32,out_channels=32, kernel_size=[3,3],
stride=[2,2], padding=[3//2,3//2]),
nn.BatchNorm2d(32),
nn.ReLU(),
nn.Conv2d(in_channels=32,out_channels=32, kernel_size=[3,3],
padding=[3//2,3//2]))
self.parallel_conv_1 = nn.Conv2d(in_channels=32,out_channels=32,
kernel_size=[1,1], stride=[2,2],
padding=[1//2,1//2])
self.residual_block_2a = nn.Sequential(
nn.BatchNorm2d(32),
nn.ReLU(),
nn.Conv2d(in_channels=32,out_channels=64, kernel_size=[3,3],
stride=[2,2], padding=[3//2,3//2]),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.Conv2d(in_channels=64,out_channels=64, kernel_size=[3,3],
padding=[3//2,3//2]))
self.parallel_conv_2 = nn.Conv2d(in_channels=32,out_channels=64,
kernel_size=[1,1], stride=[2,2],
padding=[1//2,1//2])
self.residual_block_3a = nn.Sequential(
nn.BatchNorm2d(64),
nn.ReLU(),
nn.Conv2d(in_channels=64,out_channels=128, kernel_size=[3,3],
stride=[2,2], padding=[3//2,3//2]),
nn.BatchNorm2d(128),
nn.ReLU(),
nn.Conv2d(in_channels=128,out_channels=128, kernel_size=[3,3],
padding=[3//2,3//2]))
self.parallel_conv_3 = nn.Conv2d(in_channels=64,out_channels=128,
kernel_size=[1,1], stride=[2,2],
padding=[1//2,1//2])
self.output_dim = output_dim
self.last_block = nn.Sequential(
nn.ReLU(),
nn.Dropout2d(),
nn.Linear(6272,self.output_dim))
# Initialize layers exactly as in Keras
for m in self.modules():
if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
nn.init.xavier_uniform_(m.weight, gain=nn.init.calculate_gain('relu'))
nn.init.zeros_(m.bias)
elif isinstance(m, nn.BatchNorm2d):
# Initialize kernels of Conv2d layers as kaiming normal
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
self.residual_block_1a.apply(init_kernel)
self.residual_block_2a.apply(init_kernel)
self.residual_block_3a.apply(init_kernel)
def forward(self, x):
x1 = self.layer1(x)
# First residual block
x2 = self.residual_block_1a(x1)
x1 = self.parallel_conv_1(x1)
x3 = x1.add(x2)
# Second residual block
x4 = self.residual_block_2a(x3)
x3 = self.parallel_conv_2(x3)
x5 = x3.add(x4)
# Third residual block
x6 = self.residual_block_3a(x5)
x5 = self.parallel_conv_3(x5)
x7 = x5.add(x6)
out = x7.view(x7.size(0), -1) # Flatten
out = self.last_block(out)
return out
训练圈
def compute_l2_reg(model,model_name):
# Function that sets weight_decay only for weights and not biases and only
# for conv layers inside residual layers
lambda_ = FLAGS.weight_decay
params_dict = dict(model.named_parameters())
l2_reg=[]
if model_name == 'resnet8':
for key, value in params_dict.items():
if ((key[-8:] == '2.weight' or key[-8:] == '5.weight') and key[0:8]=='residual'):
l2_reg += [lambda_*torch.norm(value.view(value.size(0),-1),2)]
l2_reg = sum(l2_reg)
return l2_reg
def train_model(model, num_epochs, learning_rate, train_loader, valid_loader,
patience, model_name):
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# To track the training loss as the model trains
train_losses = []
# To track the validation loss as the model trains
valid_losses = []
# To track the average training loss per epoch as the model trains
avg_train_losses = []
# To track the average validation loss per epoch as the model trains
avg_valid_losses = []
# Initialize the early_stopping object
early_stopping = EarlyStopping(patience=patience, verbose=True)
# Training loop
decay = FLAGS.decay # Default 1e-5
fcn = lambda step: 1./(1. + decay*step)
scheduler = LambdaLR(optimizer, lr_lambda=fcn)
for epoch in range(1, num_epochs+1):
###################
# TRAIN the model #
###################
model.train() # prep model for training
for batch, (images, targets) in enumerate(train_loader, 1):
# Load images and targets to device
images = images.to(device)
targets = targets.to(device)
# Clear gradients
optimizer.zero_grad()
# Forward pass
outputs = model(images)
# Calculate loss
l2_reg = compute_l2_reg(model,model_name)
loss = F.mse_loss(outputs, targets) + l2_reg
# Backward pass
loss.backward()
# Update weights
optimizer.step()
# Decay Learning Rate
scheduler.step()
# Record training loss
train_losses.append(loss.item())
######################
# VALIDATE the model #
######################
model.eval() # prep model for evaluation
for images, targets in valid_loader:
images = images.to(device)
targets = targets.to(device)
# Forward pass:
outputs = model(images)
# Calculate loss
loss = F.mse_loss(outputs, targets)
# Record validation loss
valid_losses.append(loss.item())
# Print training/validation statistics
# Calculate average loss over an epoch
train_loss = np.average(train_losses)
valid_loss = np.average(valid_losses)
avg_train_losses.append(train_loss)
avg_valid_losses.append(valid_loss)
epoch_len = len(str(num_epochs))
print_msg = (f'[{epoch:>{epoch_len}}/{num_epochs:>{epoch_len}}] ' +
f'train_loss: {train_loss:.5f} ' +
f'valid_loss: {valid_loss:.5f}')
print(print_msg)
# Clear lists to track next epoch
train_losses = []
valid_losses = []
# Early_stopping needs the validation loss to check if it has decresed,
# and if it has, it will make a checkpoint of the current model
early_stopping(valid_loss, model)
if early_stopping.early_stop:
print("Early stopping")
break
# Load the last checkpoint with the best model
# (returned by early_stopping call)
model.load_state_dict(torch.load('checkpoint.pt'))
print('Training completed and model saved.')
return model, avg_train_losses, avg_valid_losses
其中weight_decay = 1e-4
和Early Stop是用于检查验证损失并存储验证错误最少的模型的功能。
我尝试在PyTorch中模拟与Keras中相同的手册和默认初始化,您可以在我的模型代码中看到它。
我还试图精确再现与Keras内核正则化器中相同的正则化。