pytorch中的代码是here,我只使用了 vgg19 arch。
为了使cifar10数据集在caffe和pytorch中预处理相同,我删除了main.py
中的所有变换,但toTensor()
。我在pytorch中发现cifar10数据集的范围是[0,1],但是caffe是[0,255],所以我将比例缩小到1/255。任何额外的预处理都会被取消,以使事情变得更容易。
这是我的caffe net定义原型:
layer {
name: "data"
type: "Data"
top: "data"
top: "label"
include {
phase: TRAIN
}
transform_param {
scale: 0.00392156862745
}
data_param {
source: "/home/snk/Documents/digits-jobs/20180106-135745-4cac/train_db"
batch_size: 128
backend: LMDB
}
}
layer {
name: "data"
type: "Data"
top: "data"
top: "label"
include {
phase: TEST
}
transform_param {
scale: 0.00392156862745
}
data_param {
source: "/home/snk/Documents/digits-jobs/20180106-135745-4cac/val_db"
batch_size: 128
backend: LMDB
}
}
layer {
name: "conv1_1"
type: "Convolution"
bottom: "data"
top: "conv1_1"
convolution_param {
num_output: 64
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "relu1_1"
type: "ReLU"
bottom: "conv1_1"
top: "conv1_1"
}
layer {
name: "conv1_2"
type: "Convolution"
bottom: "conv1_1"
top: "conv1_2"
convolution_param {
num_output: 64
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "relu1_2"
type: "ReLU"
bottom: "conv1_2"
top: "conv1_2"
}
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1_2"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv2_1"
type: "Convolution"
bottom: "pool1"
top: "conv2_1"
convolution_param {
num_output: 128
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "relu2_1"
type: "ReLU"
bottom: "conv2_1"
top: "conv2_1"
}
layer {
name: "conv2_2"
type: "Convolution"
bottom: "conv2_1"
top: "conv2_2"
convolution_param {
num_output: 128
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "relu2_2"
type: "ReLU"
bottom: "conv2_2"
top: "conv2_2"
}
layer {
name: "pool2"
type: "Pooling"
bottom: "conv2_2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv3_1"
type: "Convolution"
bottom: "pool2"
top: "conv3_1"
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "relu3_1"
type: "ReLU"
bottom: "conv3_1"
top: "conv3_1"
}
layer {
name: "conv3_2"
type: "Convolution"
bottom: "conv3_1"
top: "conv3_2"
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "relu3_2"
type: "ReLU"
bottom: "conv3_2"
top: "conv3_2"
}
layer {
name: "conv3_3"
type: "Convolution"
bottom: "conv3_2"
top: "conv3_3"
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "relu3_3"
type: "ReLU"
bottom: "conv3_3"
top: "conv3_3"
}
layer {
name: "conv3_4"
type: "Convolution"
bottom: "conv3_3"
top: "conv3_4"
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "relu3_4"
type: "ReLU"
bottom: "conv3_4"
top: "conv3_4"
}
layer {
name: "pool3"
type: "Pooling"
bottom: "conv3_4"
top: "pool3"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv4_1"
type: "Convolution"
bottom: "pool3"
top: "conv4_1"
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "relu4_1"
type: "ReLU"
bottom: "conv4_1"
top: "conv4_1"
}
layer {
name: "conv4_2"
type: "Convolution"
bottom: "conv4_1"
top: "conv4_2"
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "relu4_2"
type: "ReLU"
bottom: "conv4_2"
top: "conv4_2"
}
layer {
name: "conv4_3"
type: "Convolution"
bottom: "conv4_2"
top: "conv4_3"
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "relu4_3"
type: "ReLU"
bottom: "conv4_3"
top: "conv4_3"
}
layer {
name: "conv4_4"
type: "Convolution"
bottom: "conv4_3"
top: "conv4_4"
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "relu4_4"
type: "ReLU"
bottom: "conv4_4"
top: "conv4_4"
}
layer {
name: "pool4"
type: "Pooling"
bottom: "conv4_4"
top: "pool4"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv5_1"
type: "Convolution"
bottom: "pool4"
top: "conv5_1"
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "relu5_1"
type: "ReLU"
bottom: "conv5_1"
top: "conv5_1"
}
layer {
name: "conv5_2"
type: "Convolution"
bottom: "conv5_1"
top: "conv5_2"
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "relu5_2"
type: "ReLU"
bottom: "conv5_2"
top: "conv5_2"
}
layer {
name: "conv5_3"
type: "Convolution"
bottom: "conv5_2"
top: "conv5_3"
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "relu5_3"
type: "ReLU"
bottom: "conv5_3"
top: "conv5_3"
}
layer {
name: "conv5_4"
type: "Convolution"
bottom: "conv5_3"
top: "conv5_4"
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "relu5_4"
type: "ReLU"
bottom: "conv5_4"
top: "conv5_4"
}
layer {
name: "pool5"
type: "Pooling"
bottom: "conv5_4"
top: "pool5"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "drop6"
type: "Dropout"
bottom: "pool5"
top: "drop_pool5"
dropout_param {
dropout_ratio: 0.5
}
}
layer {
name: "fc6"
type: "InnerProduct"
bottom: "drop_pool5"
top: "fc6"
inner_product_param {
num_output: 512
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "relu6"
type: "ReLU"
bottom: "fc6"
top: "fc6"
}
layer {
name: "drop7"
type: "Dropout"
bottom: "fc6"
top: "fc6"
dropout_param {
dropout_ratio: 0.5
}
}
layer {
name: "fc7"
type: "InnerProduct"
bottom: "fc6"
top: "fc7"
inner_product_param {
num_output: 512
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "relu7"
type: "ReLU"
bottom: "fc7"
top: "fc7"
}
layer {
name: "fc8"
type: "InnerProduct"
bottom: "fc7"
top: "fc8"
inner_product_param {
num_output: 10
weight_filler {
type: "xavier"
}
bias_filler {
type: "xavier"
}
}
}
layer {
name: "acc"
type: "Accuracy"
bottom: "fc8"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "fc8"
bottom: "label"
top: "loss"
}
这是我的solver.prototxt
test_iter: 79
test_interval: 391
base_lr: 0.05
display: 40
max_iter: 117300
lr_policy: "step"
gamma: 0.5
momentum: 0.9
weight_decay: 0.0005
stepsize: 11730
snapshot: 3910
snapshot_prefix: "snapshot"
solver_mode: GPU
net: "train_val.prototxt"
solver_type: SGD
事实上,pytorch能够以非常快的速度训练模型,一个纪元后的acc约为20%,但在caffe中,损失总是在2.3XXX左右(约-log(0.1),随机猜测损失),我怀疑是因为不同的权重初始化,所以我将caffe的filler.hpp
xavier更改为有效的backprop'(即U(-std,std),std = 1 /(sqrt(fan_in))),但它没有用。
现在,唯一的区别是偏差初始化方法,在pytorch中它使用权重的fan_in,但在caffe中我认为它使用output_num作为fan_in(因为它的形状是[1,N],N是输出神经元的数量,在filler.hpp中,它使用blob.count()/ blob.num()作为xavier的fan_in。
谁能帮帮我?我想如果所有的配置都相同,那么培训过程几乎是一样的,但它打破了我的意见。