I'm building a basic 3-layer neural network in Python. After writing the gradient function, I went on to gradient-check it against a numerical gradient. Since the relative difference came out large, I unrolled the gradients of both weight matrices and compared them side by side:
Function Gradient Numerical Gradient
-0.000968788380809 0.0
0.0153540197907 0.0153540197889
-0.00584391679274 -0.00584391679048
-0.00490359558077 -0.00490359558514
-0.00171892592537 -0.0017189259216
0.00913024106334 0.00913024106319
-0.0182154767069 -0.0182154767092
0.0152611324409 0.01526113244
-0.00373505297372 -0.00373505297135
-0.00513225994728 -0.00513225994814
-0.00531954399401 -0.00531954399641
-0.0185748801227 -0.0185748801163
0.00745186105851 0.00745186105267
0.0134566626927 0.0134566626908
0.0251548691426 0.0251548691388
0.00609388350562 0.00609388350226
-0.00471176815719 -0.00471176815564
0.0113580721225 0.0113580721228
0.00465172663488 0.00465172663944
-0.0221326283708 -0.02213262837
0.300007655583 -0.300007655583 <-diverges, corresponding to theta2
0.155638694282 -0.15345321819
0.147747817305 -0.149026829224
0.150703152382 -0.172330417252
0.156307235611 -0.116975643856
0.136898763375 -0.170081036297
0.0621121242042 -0.0621121242372
0.0442762464937 -0.0187338352431
0.0489123689979 -0.00938236375481
0.0244392582651 -0.0465061209964
0.0237741996575 -0.028319115235
0.0313594790974 -0.0330473942922
0.106306327946 -0.106306327941
0.0348751481828 -0.0704775747806
0.0303373211657 -0.0756744476749
0.0633094699759 -0.0461971224763
0.0524239030728 -0.0477244101571
0.0633274024777 -0.0397657392082
Relative Difference:
6.61473694017
The first 20 elements in each column correspond to the gradient of the first weight matrix, and the remaining 18 to the gradient of the second. From what I can see, the problem is in the last 18 elements (i.e. the theta2 gradient), where the function gradient starts to diverge from the "correct" numerical gradient. This also causes scipy.optimize.fmin_cg to give me the following:
Warning: Desired error not necessarily achieved due to precision loss.
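The optimizer call itself is nothing special; a simplified sketch of it (argument names match the gradient-checking code below):

import scipy.optimize

# minimize the cost with the analytic gradient; args is forwarded to
# both computeCost and gradient
opt_theta = scipy.optimize.fmin_cg(
    computeCost, nn_params, fprime=gradient,
    args=(X, y, input_layer_size, hidden_layer_size, num_labels, lamb))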
Any help would be much appreciated! Here is the relevant code:
def sigmoid(z):
    return 1 / (1+np.exp(z))

def sigmoid_gradient(z):
    return sigmoid(z)*(1-sigmoid(z))

def randInitializeWeights(layer_in, layer_out):
    matrix = np.zeros((layer_out, 1 + layer_in))
    epsilon_init = 0.12
    matrix = np.random.rand(layer_out, 1+layer_in) * 2 * epsilon_init - epsilon_init
    return matrix

def gradient(theta, *args):
    X, y, num_inputs, num_hidden_units, num_labels, lamb = args
    m = len(X)
    theta1 = np.reshape(theta[0:(num_hidden_units*(num_inputs+1))], (num_hidden_units, (num_inputs+1)))
    theta2 = np.reshape(theta[(num_hidden_units*(num_inputs+1)):], (num_labels, num_hidden_units+1))
    theta1_grad = np.zeros(theta1.shape)
    theta2_grad = np.zeros(theta2.shape)
    delta1 = np.zeros(theta1.shape)
    delta2 = np.zeros(theta2.shape)
    for t in range(0, m):
        vec_y = np.zeros(num_labels)
        vec_y[y[t]] = 1
        vec_y = vec_y[:, np.newaxis]
        #feedforward to compute all the neuron activations
        a_1 = np.r_[[1], X[t]]
        a_1 = a_1[:, np.newaxis]
        z_2 = np.dot(theta1, a_1)
        a_2 = np.vstack([1, sigmoid(z_2)])
        z_3 = np.dot(theta2, a_2)
        a_3 = sigmoid(z_3)
        #error for output nodes
        del3 = a_3 - vec_y
        #error for hidden nodes
        del2 = np.multiply(np.dot(theta2.T, del3), sigmoid_gradient(np.vstack([1, z_2])))
        #remove bias unit
        del2 = del2[1:]
        #accumulate gradient
        delta1 = delta1 + del2*a_1.T
        delta2 = delta2 + del3*a_2.T
    #no need to regularize the first column
    theta1_grad[:, 0] = (1/m)*delta1[:, 0]
    theta2_grad[:, 0] = (1/m)*delta2[:, 0]
    #regularize the rest
    theta1_grad[:, 1:] = ((1/m) * delta1[:, 1:]) + (lamb/m)*theta1[:, 1:]
    theta2_grad[:, 1:] = ((1/m) * delta2[:, 1:]) + (lamb/m)*theta2[:, 1:]
    #unroll
    grad = np.hstack([theta1_grad.ravel(), theta2_grad.ravel()])
    return grad

def gradientChecking(lamb):
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5
    theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
    theta2 = randInitializeWeights(hidden_layer_size, num_labels)
    X = np.random.rand(m, input_layer_size)
    y = np.array([1, 2, 0, 1, 2])
    nn_params = np.hstack([theta1.ravel(), theta2.ravel()])
    #calculate gradient with function
    grad = gradient(nn_params, X, y, input_layer_size, hidden_layer_size, num_labels, lamb)
    #calculate numerical gradient
    num_grad = computeNumericalGradient(lambda theta: computeCost(theta, X, y, input_layer_size, hidden_layer_size, num_labels, lamb), nn_params)
    print('Function Gradient', 'Numerical Gradient')
    for i in range(len(grad)):
        print(grad[i], num_grad[i])
    diff = np.linalg.norm(num_grad-grad)/np.linalg.norm(num_grad+grad)
    print('Relative Difference: ')
    print(diff)

def computeNumericalGradient(J, theta):
    numgrad = np.zeros(theta.shape)
    perturb = np.zeros(theta.shape)
    e = 0.0001
    for p in range(1, np.size(theta)):
        perturb[p] = e
        loss1 = J(theta - perturb)
        loss2 = J(theta + perturb)
        numgrad[p] = (loss2 - loss1) / (2*e)
        perturb[p] = 0
    return numgrad
Answer 0 (score: 1)
Your sigmoid function has a bug. It should look like this:
def sigmoid(z):
    return 1 / (1+np.exp(-z))
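Note that 1/(1+np.exp(z)) is sigmoid(-z), which equals 1 - sigmoid(z), so its true derivative has the opposite sign of sigmoid(z)*(1-sigmoid(z)). That is why the del3 = a_3 - vec_y shortcut no longer matches the numerical gradient for theta2, while the theta1 rows still agree (the extra sign flip cancels through sigmoid_gradient in the hidden layer). A quick standalone check of that identity, just for illustration:

import numpy as np

z = np.linspace(-5.0, 5.0, 11)
buggy   = 1 / (1 + np.exp(z))    # original version, missing the minus sign
correct = 1 / (1 + np.exp(-z))   # fixed version

# the buggy version equals sigmoid(-z) == 1 - sigmoid(z)
print(np.allclose(buggy, 1 - correct))  # prints True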
I am a bit puzzled by your implementation of backpropagation; I would do it without the for loop. You didn't post computeCost, so I wrote it myself and checked the gradient. In my case both columns are the same:
('Function Gradient', 'Numerical Gradient')
(-0.0087363416123043425, 0.0)
(0.017468375248392107, 0.0174683752529603)
(-0.0016267134050363559, -0.0016267134039793518)
(0.0018882373947080224, 0.0018882373997719526)
(-0.0063531428795779391, -0.0063531428762253483)
(0.0029882213493977773, 0.0029882213481435826)
(0.014295787205089885, 0.014295787205131916)
(-0.026668095974979808, -0.026668095973736428)
(0.0043373799514851595, 0.0043373799440971084)
(0.0063740837472641377, 0.0063740837497050506)
(0.0027102260448642525, 0.0027102260435896142)
(0.0067009063282609839, 0.0067009063298151261)
(-0.0029645476578591843, -0.0029645476562478734)
(-0.012000477453137556, -0.012000477451756808)
(-0.020065071389262716, -0.020065071393293721)
(0.010308693441913186, 0.010308693438876304)
(-0.0015996484140612609, -0.0015996484115099463)
(-0.0086037766244218914, -0.0086037766244828617)
(-0.0099431361329477934, -0.0099431361344493041)
(0.0062574996404342166, 0.0062574996406716821)
(0.30213488769328123, 0.3021348876908192)
(0.14900524972537924, 0.14900524972549789)
(0.13305168538400619, 0.13305168538479961)
(0.16730920742910549, 0.16730920743279754)
(0.14245586995768528, 0.14245586995365045)
(0.15465244296463604, 0.15465244296519742)
(0.10813908901043021, 0.10813908900342284)
(0.040844058224880242, 0.04084405822446513)
(0.040566215206120269, 0.040566215204762557)
(0.036451467449020114, 0.036451467448905817)
(0.065664340475228455, 0.065664340476168093)
(0.070753692265581092, 0.07075369226283712)
(0.088651862157018618, 0.088651862166777562)
(0.028272897964677978, 0.028272897965031518)
(0.026876928049457398, 0.026876928049812676)
(0.056512225949437798, 0.056512225949933992)
(0.051775047342360533, 0.051775047342772496)
(0.025689087137289929, 0.025689087135294386)
Relative Difference:
0.00878484310135
Here is my code:
import numpy as np

def sigmoid(z):
    return 1 / (1+np.exp(-z))

def sigmoid_gradient(z):
    return sigmoid(z)*(1-sigmoid(z))

def randInitializeWeights(layer_in, layer_out):
    matrix = np.zeros((layer_out, 1 + layer_in))
    epsilon_init = 0.12
    matrix = np.random.rand(layer_out, 1+layer_in) * 2 * epsilon_init - epsilon_init
    return matrix

def gradient(theta, *args):
    X, y, num_inputs, num_hidden_units, num_labels, lamb = args
    m = len(X)
    y_bin = np.zeros((m, num_labels))
    for i in range(m):
        y_bin[i, y[i]] = 1
    theta1 = np.reshape(theta[0:(num_hidden_units*(num_inputs+1))], (num_hidden_units, (num_inputs+1))) #5x4
    theta2 = np.reshape(theta[(num_hidden_units*(num_inputs+1)):], (num_labels, num_hidden_units+1)) #3x6
    theta1_grad = np.zeros(theta1.shape)
    theta2_grad = np.zeros(theta2.shape)
    delta1 = np.zeros(theta1.shape)
    delta2 = np.zeros(theta2.shape)
    #forward
    a_1 = np.hstack((np.ones((m, 1)), X)) #5x4
    z_2 = np.dot(a_1, theta1.transpose()) #5x5
    a_2 = sigmoid(z_2) #5x5
    a_2 = np.hstack((np.ones((m, 1)), a_2)) #5x6
    z_3 = np.dot(a_2, theta2.transpose()) #5x3
    h = sigmoid(z_3) #5x3
    #backward
    delta3 = h - y_bin #5x3
    delta2 = np.dot(delta3, theta2[:, 1:num_hidden_units+1]) * sigmoid_gradient(z_2) #5x5
    D1 = np.dot(delta2.transpose(), a_1) #5x4
    D2 = np.dot(delta3.transpose(), a_2) #3x6
    theta1_grad = D1/m #5x4
    theta2_grad = D2/m #3x6
    #regularization
    theta1_grad[:, 1:num_inputs+1] = theta1_grad[:, 1:num_inputs+1] + lamb/m * theta1[:, 1:num_inputs+1]
    theta2_grad[:, 1:num_hidden_units+1] = theta2_grad[:, 1:num_hidden_units+1] + lamb/m * theta2[:, 1:num_hidden_units+1]
    #unroll
    grad = np.hstack([theta1_grad.ravel(), theta2_grad.ravel()])
    return grad

def gradientChecking(lamb):
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5
    theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
    theta2 = randInitializeWeights(hidden_layer_size, num_labels)
    X = np.random.rand(m, input_layer_size)
    y = np.array([1, 2, 0, 1, 2])
    nn_params = np.hstack([theta1.ravel(), theta2.ravel()])
    #calculate gradient with function
    grad = gradient(nn_params, X, y, input_layer_size, hidden_layer_size, num_labels, lamb)
    #calculate numerical gradient
    num_grad = computeNumericalGradient(lambda theta: computeCost(theta, X, y, input_layer_size, hidden_layer_size, num_labels, lamb), nn_params)
    print('Function Gradient', 'Numerical Gradient')
    for i in range(len(grad)):
        print(grad[i], num_grad[i])
    diff = np.linalg.norm(num_grad-grad)/np.linalg.norm(num_grad+grad)
    print('Relative Difference: ')
    print(diff)

def computeCost(theta, X, y, num_inputs, num_hidden_units, num_labels, lamb):
    m = len(X)
    y_bin = np.zeros((m, num_labels))
    for i in range(m):
        y_bin[i, y[i]] = 1
    theta1 = np.reshape(theta[0:(num_hidden_units*(num_inputs+1))], (num_hidden_units, (num_inputs+1))) #5x4
    theta2 = np.reshape(theta[(num_hidden_units*(num_inputs+1)):], (num_labels, num_hidden_units+1)) #3x6
    a_1 = np.hstack((np.ones((m, 1)), X)) #5x4
    z_2 = np.dot(a_1, theta1.transpose()) #5x5
    a_2 = sigmoid(z_2) #5x5
    a_2 = np.hstack((np.ones((m, 1)), a_2)) #5x6
    z_3 = np.dot(a_2, theta2.transpose()) #5x3
    h = sigmoid(z_3)
    cost = np.sum(-y_bin * np.log(h) - (1-y_bin) * np.log(1-h))/m
    #regularization
    theta1_sq = theta1[:, 1:num_inputs+1] * theta1[:, 1:num_inputs+1]
    theta2_sq = theta2[:, 1:num_hidden_units+1] * theta2[:, 1:num_hidden_units+1]
    cost = cost + lamb/(2.0*m)*(np.sum(theta1_sq) + np.sum(theta2_sq))
    return cost

def computeNumericalGradient(J, theta):
    numgrad = np.zeros(theta.shape)
    perturb = np.zeros(theta.shape)
    e = 0.0001
    for p in range(1, np.size(theta)):
        perturb[p] = e
        loss1 = J(theta - perturb)
        loss2 = J(theta + perturb)
        numgrad[p] = (loss2 - loss1) / (2*e)
        perturb[p] = 0
    return numgrad

gradientChecking(1.0)