Starting from the tutorials on the TensorFlow website, I am trying to build a very simple neural network that predicts gender based on the Adience Benchmark. Here is the code of my two attempts:
def model1(x, y_, data_size):
    W = tf.Variable(tf.zeros([data_size, 1]))
    b = tf.Variable(tf.zeros([1]))
    y = tf.matmul(x, W) + b
    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
    return tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy), y, cross_entropy
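One thing I am not sure about in model1: since y has only one column, the softmax inside tf.nn.softmax_cross_entropy_with_logits is computed over a single value, which I believe always gives a loss of 0 regardless of the labels. A tiny standalone check (the numbers are made up):

import tensorflow as tf

# Made-up logits and labels with a single column, mirroring the [batch, 1] shapes above.
logits = tf.constant([[3.7], [-1.2], [0.5]])
labels = tf.constant([[1.0], [2.0], [1.0]])

# Softmax over a single value is always 1.0, so the cross entropy is 0 for every row.
ce = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)

with tf.Session() as sess:
    print(sess.run(ce))  # [0. 0. 0.]

And this is my second attempt: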
def model2(x, y_):
    def fully_connected(input, size):
        weights = tf.get_variable('weights',
            shape=[input.get_shape()[1], size],
            initializer=tf.contrib.layers.xavier_initializer()
        )
        biases = tf.get_variable('biases',
            shape=[size],
            initializer=tf.constant_initializer(0.0)
        )
        return tf.matmul(input, weights) + biases

    def model_pass(input):
        with tf.variable_scope('hidden'):
            hidden = fully_connected(input, size=100)
            relu_hidden = tf.nn.relu(hidden)
        with tf.variable_scope('out'):
            prediction = fully_connected(relu_hidden, size=1)
        return prediction

    predictions = model_pass(x)
    loss = tf.reduce_mean(tf.square(predictions - y_))
    optimizer = tf.train.MomentumOptimizer(
        learning_rate=0.01,
        momentum=0.9,
        use_nesterov=True
    ).minimize(loss)
    return optimizer, predictions, loss
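To rule out the input pipeline, I am also thinking of running model2 on random data in isolation. A rough sketch of that smoke test (the feature size, batch size and number of steps here are arbitrary):

import numpy as np
import tensorflow as tf

toy_size = 64  # arbitrary feature size for the smoke test
toy_graph = tf.Graph()
with toy_graph.as_default():
    x = tf.placeholder(tf.float32, [None, toy_size])
    y_ = tf.placeholder(tf.float32, [None, 1])
    train_step, predictions, loss = model2(x, y_)

# Random inputs in [0, 1] and labels in {1, 2}, mimicking the real data.
xs = np.random.rand(128, toy_size).astype(np.float32)
ys = np.random.randint(1, 3, size=(128, 1)).astype(np.float32)

with tf.Session(graph=toy_graph) as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(200):
        l, _ = sess.run([loss, train_step], feed_dict={x: xs, y_: ys})
        if step % 50 == 0:
            print(step, l)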
Here is the code that trains the model (the batch size is 128, and the images are loaded and cropped to 227x227 as suggested in this paper):
graph = tf.Graph()
with graph.as_default():
    path_batch, label_batch = input_pipeline(people_path, batch_size, None, True)
    label_batch = extract_feature(label_batch, 1)
    label_batch = tf.reshape(label_batch, [batch_size, 1])
    data_batch = path_to_image_crop(path_batch, os.path.dirname(people_path), image_prefix, image_dimension)
    data_batch = tf.reshape(data_batch, [batch_size, data_size])

    x = tf.placeholder(tf.float32, [None, data_size])
    y_ = tf.placeholder(tf.float32, [None, 1])

    train_step, y, loss = model1(x, y_, data_size)
    #train_step, y, loss = model2(x, y_)

with tf.Session(graph=graph) as session:
    session.run(tf.global_variables_initializer())
    session.run(tf.local_variables_initializer())

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    for i in range(num_epochs):
        batch_xs, batch_ys = session.run([data_batch, label_batch])
        batch_xs = images_as_float(batch_xs, batch_size, data_size)

        p, l, _ = session.run([y, loss, train_step], feed_dict={x: batch_xs, y_: batch_ys})
        print('%d: %s -> %s %s' % (i, l, p[i % batch_size], batch_ys[i % batch_size]))  # 1 -> male, 2 -> female
        if i == 0:
            print(batch_xs)

    coord.request_stop()
    coord.join(threads)
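In case it is relevant, this is roughly how I inspect a fetched batch before feeding it (a small debugging helper of mine, not part of the models above; I call it right after images_as_float as describe_batch(batch_xs, batch_ys)):

import numpy as np

def describe_batch(batch_xs, batch_ys):
    # Quick statistics of one fed batch, for debugging only.
    batch_xs = np.asarray(batch_xs)
    batch_ys = np.asarray(batch_ys)
    print('images: shape=%s min=%.4f max=%.4f mean=%.4f'
          % (batch_xs.shape, batch_xs.min(), batch_xs.max(), batch_xs.mean()))
    print('labels: shape=%s unique=%s'
          % (batch_ys.shape, np.unique(batch_ys)))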
The first model produces a loss equal to zero at every iteration; this is its output:
0: 0.0 -> [ 0.] [2]
[[ 0.09019608 0.0745098 0.07058824 ..., 0.14509804 0.08627451
0.05882353]
[ 0.03529412 0.03137255 0.02352941 ..., 0.05882353 0.04313725
0.03921569]
[ 0. 0. 0. ..., 0. 0. 0. ]
...,
[ 0.27843137 0.2627451 0.32156863 ..., 0.17647059 0.21176471
0.24705882]
[ 0.19607843 0.03137255 0.07058824 ..., 0.19215686 0.0745098
0.08235294]
[ 0.29803922 0.20392157 0.15686275 ..., 0.70588235 0.45490196
0.41568627]]
1: 0.0 -> [ 8368.69335938] [1]
2: 0.0 -> [ 9435.07910156] [1]
3: 0.0 -> [ 6342.55175781] [2]
4: 0.0 -> [ 24146.79492188] [2]
5: 0.0 -> [ 38010.859375] [2]
6: 0.0 -> [ 28421.10546875] [1]
7: 0.0 -> [ 38900.63671875] [2]
8: 0.0 -> [ 12061.45605469] [2]
9: 0.0 -> [ 62396.5390625] [2]
10: 0.0 -> [ 44290.84765625] [2]
11: 0.0 -> [ 26015.90234375] [2]
12: 0.0 -> [ 57388.23046875] [1]
13: 0.0 -> [ 119108.203125] [1]
14: 0.0 -> [ 131051.671875] [1]
15: 0.0 -> [ 131854.78125] [2]
16: 0.0 -> [ 159839.875] [2]
17: 0.0 -> [ 128897.90625] [1]
18: 0.0 -> [ 61369.3359375] [1]
19: 0.0 -> [ 190607.71875] [1]
....
After a few iterations, the second model diverges:
0: 2.14689 -> [ 0.14331065] [1]
[[ 0.03921569 0.07843137 0.04705882 ..., 0.34117647 0.29803922
0.36078431]
[ 0.16078431 0.11764706 0.13333333 ..., 0.17254902 0.11372549
0.10196078]
[ 0.18823529 0.15294118 0.1254902 ..., 0.90196078 0.84705882
0.84313725]
...,
[ 0.50196078 0.36078431 0.29803922 ..., 0.6 0.40784314
0.34901961]
[ 0.58039216 0.40392157 0.38039216 ..., 0.6745098 0.61176471
0.55294118]
[ 0.17254902 0.29803922 0.14509804 ..., 0.16470588 0.15686275
0.23921569]]
1: 1.38878e+06 -> [ 1075.10534668] [1]
2: 17212.8 -> [-68.56524658] [1]
3: 2431.18 -> [-46.70772934] [2]
4: 4.38701e+11 -> [ 670822.0625] [2]
5: 5.75069e+08 -> [-23979.0625] [1]
6: 1.10681e+09 -> [-33267.28515625] [1]
7: 1.66428e+09 -> [-40794.125] [1]
8: 2.17327e+09 -> [-46616.8828125] [1]
9: 2.58284e+09 -> [-50820.1875] [2]
10: 2.86359e+09 -> [-53511.0703125] [2]
11: 3.00476e+09 -> [-54814.17578125] [1]
12: 3.01057e+09 -> [-54867.203125] [2]
I know that a convolutional model would work better than the ones I implemented, but my intuition was that model 1 and model 2 would converge with poor accuracy rather than produce these strange results. Is my assumption wrong, or is there a mistake in the way I implemented the models or the training step?
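For scale: with labels in {1, 2}, even a constant predictor at the label mean should keep the squared-error loss around 0.25, so values like 3e+09 look like divergence rather than just poor accuracy. A quick check of that baseline (the labels below are made up):

import numpy as np

labels = np.array([1., 2., 1., 2., 2., 1.])  # made-up gender labels (1 = male, 2 = female)
baseline = labels.mean()                     # constant predictor at the label mean
print(np.mean((baseline - labels) ** 2))     # 0.25 for a balanced batch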
The full code is on github.
Thanks for any suggestion!
Best regards,
Gianluca