Accuracy loss when adding a second hidden layer with backpropagation on the MNIST dataset

Date: 2017-03-21 14:08:03

Tags: python tensorflow mnist

I am trying to add a second hidden layer to a working neural network that uses hand-calculated backpropagation (rather than the optimizers built into TensorFlow).

Instead of one ReLU layer followed by a softmax layer, I tried adding a second ReLU layer. The result is a complete loss of accuracy: it drops from 92.3% to 8.5% and never improves.
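Concretely, the change to the architecture is (see the forward-propagation section in the full code below):

# before: 784 -> ReLU(100) -> softmax(10)
# after:  784 -> ReLU(100) -> ReLU(100) -> softmax(10)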

At first I thought the mistake was in my backpropagation calculation, but I double-checked it by commenting out what I had written and running the network with

step1 = tf.train.GradientDescentOptimizer(0.1).minimize(cost)

in place of the backpropagation section listed below. This also resulted in low accuracy. To confuse me further, if I use AdamOptimizer(0.0001) instead of gradient descent, the code does improve the overall result as expected, reaching 97% accuracy.
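That is, the sanity check swaps only the update step on the same graph (cost as defined below):

step1 = tf.train.GradientDescentOptimizer(0.1).minimize(cost)   # still ~8.5% accuracy
step1 = tf.train.AdamOptimizer(0.0001).minimize(cost)           # reaches ~97%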

Why am I losing accuracy here?

Backpropagation

d_z_3 = tf.multiply(cost, sigmaprime(z_3)) #OK?
d_b_3 = d_z_3
d_w_3 = tf.matmul(tf.transpose(a_2), d_z_3)
d_a_2 = tf.matmul(d_z_3, tf.transpose(w_3))
d_z_2 = tf.multiply(d_a_2, d_relu(z_2)) #change
d_b_2 = d_z_2
d_w_2 = tf.matmul(tf.transpose(a_1), d_z_2)
d_a_1 = tf.matmul(d_z_2, tf.transpose(w_2))
d_z_1 = tf.multiply(d_a_1, d_relu(z_1)) #change
d_b_1 = d_z_1
d_w_1 = tf.matmul(tf.transpose(a_0), d_z_1)
eta = tf.constant(0.5)

step1 = [
    tf.assign(w_1,
            tf.subtract(w_1, tf.multiply(eta, d_w_1)))
  , tf.assign(b_1,
            tf.subtract(b_1, tf.multiply(eta,
                               tf.reduce_mean(d_b_1, axis=[0]))))
  , tf.assign(w_2,
            tf.subtract(w_2, tf.multiply(eta, d_w_2)))
  , tf.assign(b_2,
            tf.subtract(b_2, tf.multiply(eta,
                               tf.reduce_mean(d_b_2, axis=[0]))))
  , tf.assign(w_3,
            tf.subtract(w_3, tf.multiply(eta, d_w_3)))
  , tf.assign(b_3,
            tf.subtract(b_3, tf.multiply(eta,
                               tf.reduce_mean(d_b_3, axis=[0]))))
]
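Each tf.assign above applies a plain gradient-descent step, w ← w − eta·d_w, with the bias gradients averaged over the batch via reduce_mean. (sigmaprime and d_relu are helper functions for the sigmoid and ReLU derivatives; their definitions appear in the full listing below.)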

Full code

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot = True)
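
# NOTE: sigmaprime and d_relu are used in the backpropagation section below
# but were not defined anywhere in the post; standard derivative
# definitions are assumed here so the listing runs.
def sigmaprime(x):
    # derivative of the sigmoid: sigmoid(x) * (1 - sigmoid(x))
    return tf.multiply(tf.sigmoid(x), tf.subtract(tf.constant(1.0), tf.sigmoid(x)))

def d_relu(x):
    # derivative of ReLU: 1 where x > 0, otherwise 0
    return tf.cast(tf.greater(x, 0.0), tf.float32)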
a_0 = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])

middle = 100

#randomize the weights
w_1 = tf.Variable(tf.random_normal([784, middle], stddev=0.01))
b_1 = tf.Variable(tf.zeros([1, middle]))
w_2 = tf.Variable(tf.random_normal([middle, middle], stddev=0.01))
b_2 = tf.Variable(tf.zeros([1, middle]))
w_3 = tf.Variable(tf.random_normal([middle, 10], stddev=0.01))
b_3 = tf.Variable(tf.zeros([1, 10]))

#forward propagation (with relu 2x, softmax)
z_1 = tf.add(tf.matmul(a_0, w_1), b_1)
a_1 = tf.nn.relu(z_1)
z_2 = tf.add(tf.matmul(a_1, w_2), b_2)
a_2 = tf.nn.relu(z_2)
z_3 = tf.add(tf.matmul(a_2, w_3), b_3)
a_3 = tf.nn.softmax(z_3)
diff = tf.subtract(a_3, y)
cost = tf.multiply(diff, diff) # element-wise squared error

#backpropagation
d_z_3 = tf.multiply(cost, sigmaprime(z_3)) #OK?
d_b_3 = d_z_3
d_w_3 = tf.matmul(tf.transpose(a_2), d_z_3)
d_a_2 = tf.matmul(d_z_3, tf.transpose(w_3))
d_z_2 = tf.multiply(d_a_2, d_relu(z_2)) #change
d_b_2 = d_z_2
d_w_2 = tf.matmul(tf.transpose(a_1), d_z_2)
d_a_1 = tf.matmul(d_z_2, tf.transpose(w_2))
d_z_1 = tf.multiply(d_a_1, d_relu(z_1)) #change
d_b_1 = d_z_1
d_w_1 = tf.matmul(tf.transpose(a_0), d_z_1)
eta = tf.constant(0.5)

step1 = [
    tf.assign(w_1,
            tf.subtract(w_1, tf.multiply(eta, d_w_1)))
  , tf.assign(b_1,
            tf.subtract(b_1, tf.multiply(eta,
                               tf.reduce_mean(d_b_1, axis=[0]))))
  , tf.assign(w_2,
            tf.subtract(w_2, tf.multiply(eta, d_w_2)))
  , tf.assign(b_2,
            tf.subtract(b_2, tf.multiply(eta,
                               tf.reduce_mean(d_b_2, axis=[0]))))
  , tf.assign(w_3,
            tf.subtract(w_3, tf.multiply(eta, d_w_3)))
  , tf.assign(b_3,
            tf.subtract(b_3, tf.multiply(eta,
                               tf.reduce_mean(d_b_3, axis=[0]))))
]

#end backpropagation section

acct_mat = tf.equal(tf.argmax(a_3, 1), tf.argmax(y, 1))
acct_res = tf.reduce_sum(tf.cast(acct_mat, tf.float32))

sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

for i in range(10000):
    batch_xs, batch_ys = mnist.train.next_batch(50)
    sess.run(step1, feed_dict = {a_0: batch_xs, y : batch_ys})
    if i % 1000 == 0:
        res = sess.run(acct_res, feed_dict =
                       {a_0: mnist.test.images[:1000],
                        y : mnist.test.labels[:1000]})
        print(res/1000)
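The printed value is the accuracy on the first 1,000 test images, checked every 1,000 training steps; with the manual backpropagation above it stays at roughly 0.085.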

0 Answers:
