Suppose we have a layer like this:
layer {
  name: "fully-connected"
  type: "InnerProduct"
  bottom: "bottom"
  top: "top"
  inner_product_param {
    num_output: 1
  }
}
The output is batch_size x 1. In several papers (the figure at the top of page 3 of link1, or the top of page 4 of link2) I've seen that they end up with a 2D image used for pixel-wise prediction. How can this output be converted into a 2D image? I was thinking of a reshape or a deconvolution, but I can't figure out how it would work. A simple example would help.
UPDATE: My input image is 304x228 and my ground truth (depth image) is 74x55.
################# Main net ##################
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 96
    kernel_size: 11
    stride: 4
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "relu1"
  type: "ReLU"
  bottom: "conv1"
  top: "conv1"
}
layer {
  name: "norm1"
  type: "LRN"
  bottom: "conv1"
  top: "norm1"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "norm1"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "conv2"
  type: "Convolution"
  bottom: "pool1"
  top: "conv2"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 2
    kernel_size: 5
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "relu2"
  type: "ReLU"
  bottom: "conv2"
  top: "conv2"
}
layer {
  name: "norm2"
  type: "LRN"
  bottom: "conv2"
  top: "norm2"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "norm2"
  top: "pool2"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "conv3"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "relu3"
  type: "ReLU"
  bottom: "conv3"
  top: "conv3"
}
layer {
  name: "conv4"
  type: "Convolution"
  bottom: "conv3"
  top: "conv4"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "relu4"
  type: "ReLU"
  bottom: "conv4"
  top: "conv4"
}
layer {
  name: "conv5"
  type: "Convolution"
  bottom: "conv4"
  top: "conv5"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "relu5"
  type: "ReLU"
  bottom: "conv5"
  top: "conv5"
}
layer {
  name: "pool5"
  type: "Pooling"
  bottom: "conv5"
  top: "pool5"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "fc6"
  type: "InnerProduct"
  bottom: "pool5"
  top: "fc6"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 4096
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "relufc6"
  type: "ReLU"
  bottom: "fc6"
  top: "fc6"
}
layer {
  name: "drop6"
  type: "Dropout"
  bottom: "fc6"
  top: "fc6"
  dropout_param {
    dropout_ratio: 0.5
  }
}
layer {
  name: "fc7"
  type: "InnerProduct"
  bottom: "fc6"
  top: "fc7"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 4070
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  type: "Reshape"
  name: "reshape"
  bottom: "fc7"
  top: "fc7_reshaped"
  reshape_param {
    # 4070 = 55 * 74, so fc7's vector becomes a 1x1x55x74 map
    shape { dim: 1 dim: 1 dim: 55 dim: 74 }
  }
}
layer {
  name: "deconv1"
  type: "Deconvolution"
  bottom: "fc7_reshaped"
  top: "deconv1"
  convolution_param {
    num_output: 64
    kernel_size: 5
    pad: 2
    stride: 1  # kernel 5, pad 2, stride 1 keeps the 55x74 size (no upsampling)
    #group: 256
    weight_filler {
      type: "bilinear"
    }
    bias_term: false
  }
}
#########################
# Fine branch: conv6 operates directly on the raw input data
layer {
  name: "conv6"
  type: "Convolution"
  bottom: "data"
  top: "conv6"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 63
    kernel_size: 9
    stride: 2
    pad: 1
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "relu6"
  type: "ReLU"
  bottom: "conv6"
  top: "conv6"
}
layer {
  name: "pool6"
  type: "Pooling"
  bottom: "conv6"
  top: "pool6"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
########################
layer {
  name: "concat"
  type: "Concat"
  bottom: "deconv1"
  bottom: "pool6"
  top: "concat"
  concat_param {
    concat_dim: 1  # concatenate along the channel axis
  }
}
layer {
  name: "conv7"
  type: "Convolution"
  bottom: "concat"
  top: "conv7"
  convolution_param {
    num_output: 64
    kernel_size: 5
    pad: 2
    stride: 1
    weight_filler {
      type: "gaussian"
      std: 0.011
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "relu7"
  type: "ReLU"
  bottom: "conv7"
  top: "conv7"
  relu_param {
    negative_slope: 0.01
    engine: CUDNN
  }
}
layer {
  name: "conv8"
  type: "Convolution"
  bottom: "conv7"
  top: "conv8"
  convolution_param {
    num_output: 64
    kernel_size: 5
    pad: 2
    stride: 1
    weight_filler {
      type: "gaussian"
      std: 0.011
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "relu8"
  type: "ReLU"
  bottom: "conv8"
  top: "conv8"
  relu_param {
    negative_slope: 0.01
    engine: CUDNN
  }
}
layer {
  name: "conv9"
  type: "Convolution"
  bottom: "conv8"
  top: "conv9"
  convolution_param {
    num_output: 1  # single-channel output, matching the depth label
    kernel_size: 5
    pad: 2
    stride: 1
    weight_filler {
      type: "gaussian"
      std: 0.011
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "relu9"
  type: "ReLU"
  bottom: "conv9"
  top: "result"
  relu_param {
    negative_slope: 0.01
    engine: CUDNN
  }
}
Log:
I1108 19:34:57.239722 4277 data_layer.cpp:41] output data size: 1,1,228,304
I1108 19:34:57.243340 4277 data_layer.cpp:41] output data size: 1,1,55,74
I1108 19:34:57.247392 4277 net.cpp:150] Setting up conv1
I1108 19:34:57.247407 4277 net.cpp:157] Top shape: 1 96 55 74 (390720)
I1108 19:34:57.248191 4277 net.cpp:150] Setting up pool1
I1108 19:34:57.248196 4277 net.cpp:157] Top shape: 1 96 27 37 (95904)
I1108 19:34:57.253263 4277 net.cpp:150] Setting up conv2
I1108 19:34:57.253276 4277 net.cpp:157] Top shape: 1 256 27 37 (255744)
I1108 19:34:57.254202 4277 net.cpp:150] Setting up pool2
I1108 19:34:57.254220 4277 net.cpp:157] Top shape: 1 256 13 18 (59904)
I1108 19:34:57.269943 4277 net.cpp:150] Setting up conv3
I1108 19:34:57.269961 4277 net.cpp:157] Top shape: 1 384 13 18 (89856)
I1108 19:34:57.285303 4277 net.cpp:150] Setting up conv4
I1108 19:34:57.285338 4277 net.cpp:157] Top shape: 1 384 13 18 (89856)
I1108 19:34:57.294801 4277 net.cpp:150] Setting up conv5
I1108 19:34:57.294841 4277 net.cpp:157] Top shape: 1 256 13 18 (59904)
I1108 19:34:57.295207 4277 net.cpp:150] Setting up pool5
I1108 19:34:57.295210 4277 net.cpp:157] Top shape: 1 256 6 9 (13824)
I1108 19:34:57.743222 4277 net.cpp:150] Setting up fc6
I1108 19:34:57.743259 4277 net.cpp:157] Top shape: 1 4096 (4096)
I1108 19:34:57.881680 4277 net.cpp:150] Setting up fc7
I1108 19:34:57.881718 4277 net.cpp:157] Top shape: 1 4070 (4070)
I1108 19:34:57.881826 4277 net.cpp:150] Setting up reshape
I1108 19:34:57.881846 4277 net.cpp:157] Top shape: 1 1 55 74 (4070)
I1108 19:34:57.884768 4277 net.cpp:150] Setting up conv6
I1108 19:34:57.885309 4277 net.cpp:150] Setting up pool6
I1108 19:34:57.885327 4277 net.cpp:157] Top shape: 1 63 55 74 (256410)
I1108 19:34:57.885395 4277 net.cpp:150] Setting up concat
I1108 19:34:57.885412 4277 net.cpp:157] Top shape: 1 64 55 74 (260480)
I1108 19:34:57.886759 4277 net.cpp:150] Setting up conv7
I1108 19:34:57.886786 4277 net.cpp:157] Top shape: 1 64 55 74 (260480)
I1108 19:34:57.897269 4277 net.cpp:150] Setting up conv8
I1108 19:34:57.897303 4277 net.cpp:157] Top shape: 1 64 55 74 (260480)
I1108 19:34:57.899129 4277 net.cpp:150] Setting up conv9
I1108 19:34:57.899138 4277 net.cpp:157] Top shape: 1 1 55 74 (4070)
Answer 0 (score: 2)

For pixel-wise prediction, the num_output value of the last fully connected layer will not be 1; it will be equal to the w*h of the image being predicted.

What made you think the value would be 1?
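As a rough sketch of that idea using this question's dimensions (55*74 = 4070; the names fc_pred, reshape_pred and pred_map are illustrative, not taken from the asker's net):

layer {
  name: "fc_pred"
  type: "InnerProduct"
  bottom: "bottom"
  top: "fc_pred"
  inner_product_param {
    num_output: 4070  # w*h of the predicted map, not 1
  }
}
layer {
  name: "reshape_pred"
  type: "Reshape"
  bottom: "fc_pred"
  top: "pred_map"
  reshape_param {
    # dim: 0 copies the batch dimension from the bottom blob
    shape { dim: 0 dim: 1 dim: 55 dim: 74 }
  }
}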
Edit 1:

Here are the dimensions of each layer mentioned in the figure on page 3 of link1:

LAYER     OUTPUT DIM [c*h*w]
coarse1   96*h1*w1     conv layer
coarse2   256*h2*w2    conv layer
coarse3   384*h3*w3    conv layer
coarse4   384*h4*w4    conv layer
coarse5   256*h5*w5    conv layer
coarse6   4096*1*1     fc layer
coarse7   X*1*1        fc layer, where 'X' can be interpreted as w*h

To understand this further, let's assume we have a network to predict the pixels of an image. The image size is 10x10, so the final output of the fc layer will also have a dimension of 100*1*1 (as in coarse7), which can be interpreted as 10x10.

Now the question is how a 1D array can correctly predict a 2D image. For this, note that the loss for this output is computed using labels that correspond to the pixel data. Thus, during training, the weights learn to predict the pixel data.
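A minimal sketch of that training setup, assuming the reshaped prediction blob "pred_map" from the sketch above and a label blob "depth" of the same 1x1x55x74 shape:

layer {
  name: "loss"
  type: "EuclideanLoss"  # sums squared per-pixel differences
  bottom: "pred_map"
  bottom: "depth"
  top: "loss"
}

Because each of the 4070 fc outputs is matched against one pixel of the label, the "1D" vector effectively learns a 2D prediction.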
编辑2:
尝试使用caffe中的draw_net.py
绘制网络,为您提供:
与relu
和conv6
相关联的fc6
图层具有相同的名称,导致绘制图像中的连接复杂。我不确定这是否会在培训期间引起一些问题,但我建议您将其中一个relu层重命名为一个唯一的名称,以避免一些不可预知的问题。
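For instance, the fc6 activation could be given its own name like this (a sketch; relu_fc6 is an arbitrary unique name):

layer {
  name: "relu_fc6"  # unique name, distinct from the ReLU in the conv6 branch
  type: "ReLU"
  bottom: "fc6"
  top: "fc6"        # still applied in place, as before
}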
Coming back to your question, no upsampling seems to happen after the fully connected layers, as the log shows:
I1108 19:34:57.881680 4277 net.cpp:150] Setting up fc7
I1108 19:34:57.881718 4277 net.cpp:157] Top shape: 1 4070 (4070)
I1108 19:34:57.881826 4277 net.cpp:150] Setting up reshape
I1108 19:34:57.881846 4277 net.cpp:157] Top shape: 1 1 55 74 (4070)
I1108 19:34:57.884768 4277 net.cpp:150] Setting up conv6
I1108 19:34:57.885309 4277 net.cpp:150] Setting up pool6
I1108 19:34:57.885327 4277 net.cpp:157] Top shape: 1 63 55 74 (256410)
The output of fc7 has a dimension of 4070*1*1. This is reshaped to 1*55*74 and passed further down the net (to the deconv1 layer in the prototxt above).

The output of the whole network is produced in conv9, whose output dimension is 1*55*74, exactly matching the dimension of the labels (the depth data).

If my answer is still not clear, please point out exactly where you expect the upsampling to happen.
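If upsampling is what you intend somewhere, the usual Caffe idiom is a strided Deconvolution initialized as a fixed bilinear filter; a sketch, not part of the asker's net (the blob names are placeholders, and num_output and group must both equal the input's channel count):

layer {
  name: "upsample"
  type: "Deconvolution"
  bottom: "small_map"   # placeholder: the blob to be enlarged
  top: "big_map"
  param { lr_mult: 0 }  # freeze the weights: pure bilinear interpolation
  convolution_param {
    num_output: 64      # = input channels
    group: 64           # one independent filter per channel
    kernel_size: 4
    stride: 2
    pad: 1              # output = stride*(in-1) + kernel - 2*pad = 2*in
    weight_filler { type: "bilinear" }
    bias_term: false
  }
}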
Answer 1 (score: 0)

If you just need a fully connected network like a conventional multilayer perceptron, use 2D blobs (shape (N, D)) and call the InnerProductLayer.
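For instance, a minimal sketch of such a net (the shape 1x4070 and num_output 100 are just illustrative):

layer {
  name: "input"
  type: "Input"
  top: "data"
  input_param {
    shape { dim: 1 dim: 4070 }  # a 2D blob of shape (N, D)
  }
}
layer {
  name: "fc"
  type: "InnerProduct"
  bottom: "data"
  top: "fc"
  inner_product_param {
    num_output: 100
  }
}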