我一直在梳理这段代码一周,试图弄清楚为什么我的成本函数会增加,如下图所示。降低学习率确实有帮助但很少。任何人都可以发现为什么成本函数没有按预期工作?
我意识到CNN会更好,但我仍然想知道为什么这个简单的网络失败了。 请帮助:)
import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import matplotlib.pyplot as plt
mnist = input_data.read_data_sets("MNIST_DATA/",one_hot=True)
def createPlaceholders():
xph = tf.placeholder(tf.float32, (784, None))
yph = tf.placeholder(tf.float32, (10, None))
return xph, yph
def init_param(layers_dim):
weights = {}
L = len(layers_dim)
for l in range(1,L):
weights['W' + str(l)] = tf.get_variable('W' + str(l), shape=(layers_dim[l],layers_dim[l-1]), initializer= tf.contrib.layers.xavier_initializer())
weights['b' + str(l)] = tf.get_variable('b' + str(l), shape=(layers_dim[l],1), initializer= tf.zeros_initializer())
return weights
def forward_prop(X,L,weights):
parameters = {}
parameters['A0'] = tf.cast(X,tf.float32)
for l in range(1,L-1):
parameters['Z' + str(l)] = tf.add(tf.matmul(weights['W' + str(l)], parameters['A' + str(l-1)]), weights['b' + str(l)])
parameters['A' + str(l)] = tf.nn.relu(parameters['Z' + str(l)])
parameters['Z' + str(L-1)] = tf.add(tf.matmul(weights['W' + str(L-1)], parameters['A' + str(L-2)]), weights['b' + str(L-1)])
return parameters['Z' + str(L-1)]
def compute_cost(ZL,Y):
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = tf.cast(Y,tf.float32), logits = ZL))
return cost
def randomMiniBatches(X,Y,minibatch_size):
m = X.shape[1]
shuffle = np.random.permutation(m)
temp_X = X[:,shuffle]
temp_Y = Y[:,shuffle]
num_complete_minibatches = int(np.floor(m/minibatch_size))
mini_batches = []
for batch in range(num_complete_minibatches):
mini_batches.append((temp_X[:,batch*minibatch_size: (batch+1)*minibatch_size], temp_Y[:,batch*minibatch_size: (batch+1)*minibatch_size]))
mini_batches.append((temp_X[:,num_complete_minibatches*minibatch_size:], temp_Y[:,num_complete_minibatches*minibatch_size:]))
return mini_batches
def model(X, Y, layers_dim, learning_rate = 0.001, num_epochs = 20, minibatch_size = 64):
tf.reset_default_graph()
costs = []
xph, yph = createPlaceholders()
weights = init_param(layers_dim)
ZL = forward_prop(xph, len(layers_dim), weights)
cost = compute_cost(ZL,yph)
optimiser = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
for epoch in range(num_epochs):
minibatches = randomMiniBatches(X,Y,minibatch_size)
epoch_cost = 0
for b, mini in enumerate(minibatches,1):
mini_x, mini_y = mini
_,c = sess.run([optimiser,cost],feed_dict={xph:mini_x,yph:mini_y})
epoch_cost += c
print('epoch: ',epoch+1,'/ ',num_epochs)
epoch_cost /= len(minibatches)
costs.append(epoch_cost)
plt.plot(costs)
print(costs)
X_train = mnist.train.images.T
n_x = X_train.shape[0]
Y_train = mnist.train.labels.T
n_y = Y_train.shape[0]
layers_dim = [n_x,10,n_y]
model(X_train, Y_train, layers_dim)
答案 0 :(得分:1)
您不必深入了解如何绘制迷你批次:我认为问题在于您出于某种原因将xph
和yph
的轴1定义为批量维度(并相应地进给)网络的计算图形要求轴0是通常所做的批量维度。
因此,您的前向传播实际上是沿批量维度执行的,这没有意义。