I wrote a multilayer perceptron with 2 hidden layers. I have a single-hidden-layer version that seems to work fine, but when I add the extra layer the loss never seems to drop below 1.
What I'm trying to do is binary classification on the MNIST dataset; more precisely, I'm trying to use this MLP to distinguish 3s from 7s. I'm using sigmoid activations and backpropagation to adjust the weights.
I suspect there is something wrong with the way I've coded the backpropagation.
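For completeness: the snippet below assumes NumPy is imported and that X (num_examples × 784 MNIST images) and y (0/1 labels for the 3-vs-7 task) are already loaded. The sigmoid helper isn't shown in my listing; it's just the standard logistic function, roughly:

import numpy as np

def sigmoid(z):
    # element-wise logistic function
    return 1.0 / (1.0 + np.exp(-z))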
num_examples = len(X) # training set size
nn_input_dim = 784 # input layer dimensionality
nn_output_dim = 2 # output layer dimensionality
# Gradient descent parameters
epsilon = 0.005 # learning rate for gradient descent
def calculate_loss(model):
    W1, b1, W2, b2, W3, b3 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3']
    # Forward propagation to calculate predictions
    z1 = X.dot(W1) + b1
    a1 = sigmoid(z1)
    z2 = a1.dot(W2) + b2
    a2 = sigmoid(z2)
    z3 = a2.dot(W3) + b3
    exp_scores = np.exp(z3)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    # Calculate the cross-entropy loss, averaged over the training set
    corect_logprobs = -np.log(probs[range(num_examples), y])
    data_loss = np.sum(corect_logprobs)
    return 1. / num_examples * data_loss
def predict(model, X):
    W1, b1, W2, b2, W3, b3 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3']
    # Forward propagation
    z1 = X.dot(W1) + b1
    a1 = sigmoid(z1)
    z2 = a1.dot(W2) + b2
    a2 = sigmoid(z2)
    z3 = a2.dot(W3) + b3
    exp_scores = np.exp(z3)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    return np.argmax(probs, axis=1)
def build_model(nn_hdim1, nn_hdim2, num_passes=20000, print_loss=False):
    # Initialize the params that need to be learned to random values
    np.random.seed(0)
    W1 = np.random.randn(nn_input_dim, nn_hdim1) / np.sqrt(nn_input_dim)
    b1 = np.zeros((1, nn_hdim1))
    W2 = np.random.randn(nn_hdim1, nn_hdim2) / np.sqrt(nn_hdim1)
    b2 = np.zeros((1, nn_hdim2))
    W3 = np.random.randn(nn_hdim2, nn_output_dim) / np.sqrt(nn_hdim2)
    b3 = np.zeros((1, nn_output_dim))

    # This is what we return at the end
    model = {}
    loss = []

    for i in range(0, num_passes):
        # Forward propagation
        z1 = X.dot(W1) + b1
        a1 = sigmoid(z1)
        z2 = a1.dot(W2) + b2
        a2 = sigmoid(z2)
        z3 = a2.dot(W3) + b3
        exp_scores = np.exp(z3)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

        # Back propagation
        delta4 = probs
        delta4[range(num_examples), y] -= 1
        dW3 = (a2.T).dot(delta4)
        db3 = np.sum(delta4, axis=0, keepdims=True)
        delta3 = delta4.dot(W3.T) * (1 - sigmoid(a2))
        dW2 = (a1.T).dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        delta2 = delta3.dot(W2.T) * (1 - sigmoid(a1))
        dW1 = np.dot(X.T, delta2)
        db1 = np.sum(delta2, axis=0)

        # Gradient descent parameter update
        W1 += -epsilon * dW1
        b1 += -epsilon * db1
        W2 += -epsilon * dW2
        b2 += -epsilon * db2
        W3 += -epsilon * dW3
        b3 += -epsilon * db3

        # Assign new parameters to the model
        model = {'W1': W1, 'b1': b1,
                 'W2': W2, 'b2': b2,
                 'W3': W3, 'b3': b3}

        if print_loss and i % 500 == 0:
            loss.append(calculate_loss(model))
            print("Loss after iter. %i: %f" % (i, calculate_loss(model)))

    return model, loss

model, loss = build_model(3, 3, num_passes=10000, print_loss=True)
This is the output I get:
Loss after iter. 500: 2.115385
Loss after iter. 1000: 3.981505
Loss after iter. 1500: 4.851937
Loss after iter. 2000: 2.286764
Loss after iter. 2500: 0.650758
Loss after iter. 3000: 0.454743
Loss after iter. 3500: 2.034308
Loss after iter. 4000: 2.221124
Loss after iter. 4500: 0.486550
Loss after iter. 5000: 1.803266
Loss after iter. 5500: 2.318486
Loss after iter. 6000: 1.594879
Loss after iter. 6500: 3.081962
Loss after iter. 7000: 2.340241
Loss after iter. 7500: 3.693953
Loss after iter. 8000: 1.618758
Loss after iter. 8500: 2.328891
Loss after iter. 9000: 2.265693
Loss after iter. 9500: 3.503290
I expected the model to keep improving after iteration 4500 (where the loss got down to about 0.49), but that doesn't happen.
At first it seemed like the step size might be too large, so that the updates overshoot the "valley" the descent should settle into, but 0.005 seems small compared to the learning rates typically used for this.
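To test the suspicion that the backward pass is wrong, the kind of check I have in mind is a central-difference gradient check on a tiny subset of the data: perturb one weight at a time, recompute the loss, and compare against the analytic gradient from the training loop. The numerical_grad helper below is just a sketch of that idea, not part of my code:

def numerical_grad(model, name, i, j, eps=1e-5):
    # central-difference estimate of d(loss)/d(model[name][i, j])
    original = model[name][i, j]
    model[name][i, j] = original + eps
    loss_plus = calculate_loss(model)
    model[name][i, j] = original - eps
    loss_minus = calculate_loss(model)
    model[name][i, j] = original  # restore the weight
    return (loss_plus - loss_minus) / (2 * eps)

For the comparison to be apples-to-apples, the dW values from the training loop would have to be divided by num_examples, since calculate_loss averages over the training set while the deltas in the loop are summed over it.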