我试图编写一个可以学习XOR输出的神经网络,它以我在Andrew Ng的Coursera课程练习4中编写的代码为基础。
问题在于,不知出于什么原因,theta1 和 theta2 的权重始终为0,只有每层中偏置神经元所连接的权重除外。
我不知道为什么会发生这种情况,我无法看到我的代码有什么问题。我的代码如下。
编辑,问题解决了
我已经意识到,我的 lambda 值设置得太高,这意味着正则化过强。糟糕!
sigmoid[x_] := 1/(1 + Exp[-x]);
siggrad[x_] := sigmoid[x]*(1 - sigmoid[x]);
costfunction[x_, y_, theta1_, theta2_, lambda_] := Module[
{k, x1, z2, z3, a2, a3, m, n, ones, J, t1, t2, i, j,
delta3, delta2, theta1grad, theta2grad},
{m, n} = Dimensions[x];
ones = ConstantArray[1, m];
x1 = Insert[x // Transpose, ones, 1] // Transpose;
z2 = x1.theta1;
a2 = sigmoid[z2];
a2 = Insert[a2 // Transpose, ones, 1] // Transpose;
z3 = a2.theta2;
a3 = sigmoid[z3];
{k} = Dimensions[y[[1]]];
J = 0;
For[i = 1, i <= k, i++,
J = J + (({-y[[All, i]]}.Transpose[
Log[{a3[[All, i]]}]]) - ({1 - y[[All, i]]}.Transpose[
Log[{1 - a3[[All, i]]}]]))/m;
];
t1 = 0;
t2 = 0;
For[i = 2, i <= Dimensions[theta1][[1]], i++,
For[j = 1, j <= Dimensions[theta1][[2]], j++,
t1 = t1 + theta1[[i, j]]^2;
]
];
For[i = 2, i <= Dimensions[theta2][[1]], i++,
For[j = 1, j <= Dimensions[theta2][[2]], j++,
t2 = t2 + theta2[[i, j]]^2;
]
];
J = J + lambda*(t1 + t2)/(2*m);
delta3 = ConstantArray[0, {1, 1}];
theta2grad = ConstantArray[0, Dimensions[theta2]];
theta1grad = ConstantArray[0, Dimensions[theta1]];
For[i = 1, i <= m, i++,
delta3 = a3[[i]] - y[[i]];
delta2 = (theta2.delta3)*siggrad[Insert[z2[[i, All]], 1, 1]];
theta2grad = theta2grad + Transpose[{a2[[i, All]]}].{delta3};
theta1grad = theta1grad + Transpose[{x1[[i, All]]}].{delta2[[2 ;;]]};
];
theta1grad = theta1grad/m;
theta2grad = theta2grad/m;
theta1grad[[2 ;;, All]] =
theta1grad[[2 ;;, All]] + (lambda/m)*theta1[[2 ;;, All]];
theta2grad[[2 ;;, All]] = theta2grad[[2 ;;, All]] + (lambda/m)*theta2[[2 ;;, All]];
{J, theta1grad, theta2grad}
]
update[theta1_, theta1grad_, lambda_] := theta1 - lambda*theta1grad
predict[x_, theta1_, theta2_] := Module[{m, n, ones, x1, z2, a2, z3, a3},
{m, n} = Dimensions[x];
ones = ConstantArray[1, m];
x1 = Insert[x // Transpose, ones, 1] // Transpose;
z2 = x1.theta1;
a2 = sigmoid[z2];
a2 = Insert[a2 // Transpose, ones, 1] // Transpose;
z3 = a2.theta2;
a3 = sigmoid[z3];
a3
]
x = {
{0.0, 0.0},
{1.0, 1.0},
{1, 0},
{0, 1}
};
y = {
{1, 0},
{1, 0},
{0, 1},
{0, 1}
};
theta1 = RandomReal[{-.12, .12}, {3, 4}];
theta2 = RandomReal[{-.12, .12}, {5, 2}];
theta1 // MatrixForm
theta2 // MatrixForm
lambda = 0.75; // <------------------------------------------ REGULARISATION
For[i = 1, i <= 1000, i++,
{J, theta1grad, theta2grad} = costfunction[x, y, theta1, theta2, lambda];
theta1 = update[theta1, theta1grad, lambda];
theta2 = update[theta2, theta2grad, lambda];
If[Mod[i, 100] == 0, Print[J];];
];
theta1 // MatrixForm
theta2 // MatrixForm