XOR的神经网络收敛到0.5

时间:2017-10-24 16:51:08

标签: machine-learning neural-network wolfram-mathematica

我试图编写一个可以学习 XOR 输出的神经网络,它以 Andrew Ng 的 Coursera 课程练习 4 中编写的那个网络为基础。

问题在于,不知出于什么原因,theta1 和 theta2 的权重最终全部收敛为 0,只有每层中偏置神经元所对应的权重除外。

我不知道为什么会发生这种情况,我无法看到我的代码有什么问题。我的代码如下。

编辑,问题解决了
我后来意识到,我把 lambda 的值设置得太高了,导致正则化过强。糟糕!

sigmoid[x_] := 1/(1 + Exp[-x]);
siggrad[x_] := sigmoid[x]*(1 - sigmoid[x]);
(* Regularized cross-entropy cost and gradients for a 3-layer sigmoid network,
   in the layout of Andrew Ng's Coursera exercise 4.
   x:      m-by-n input matrix (one example per row).
   y:      m-by-k target matrix (one-hot rows).
   theta1: (n+1)-by-h weights, input -> hidden; first ROW multiplies the bias input.
   theta2: (h+1)-by-k weights, hidden -> output; first ROW multiplies the hidden bias.
   lambda: regularization strength.
   Returns {J, theta1grad, theta2grad}.
   NOTE(review): J ends up as a 1x1 matrix {{value}} because the cost loop sums
   1x1 matrix products — callers should be aware of the extra nesting. *)
costfunction[x_, y_, theta1_, theta2_, lambda_] := Module[
    {k, x1, z2, z3, a2, a3, m, n, ones, J, t1, t2, i, j,
    delta3, delta2, theta1grad, theta2grad},
    {m, n} = Dimensions[x];
    ones = ConstantArray[1, m];
    (* Prepend a bias column of ones to the inputs (x1 is m-by-(n+1)). *)
    x1 = Insert[x // Transpose, ones, 1] // Transpose;
    (* Forward pass through both layers; bias column added to a2 as well. *)
    z2 = x1.theta1;
    a2 = sigmoid[z2];
    a2 = Insert[a2 // Transpose, ones, 1] // Transpose;
    z3 = a2.theta2;
    a3 = sigmoid[z3];

    (* k = number of output units (length of one target row). *)
    {k} = Dimensions[y[[1]]];
    J = 0;

    (* Unregularized cross-entropy, accumulated one output unit at a time. *)
    For[i = 1, i <= k, i++,
        J = J + (({-y[[All, i]]}.Transpose[
            Log[{a3[[All, i]]}]]) - ({1 - y[[All, i]]}.Transpose[
            Log[{1 - a3[[All, i]]}]]))/m;
     ];

    (* Regularization terms: loops start at i = 2, so the first row of each
       theta (the bias weights) is deliberately excluded. *)
    t1 = 0;
    t2 = 0;
    For[i = 2, i <= Dimensions[theta1][[1]], i++,
        For[j = 1, j <= Dimensions[theta1][[2]], j++,
            t1 = t1 + theta1[[i, j]]^2;
        ]
    ];
    For[i = 2, i <= Dimensions[theta2][[1]], i++,
        For[j = 1, j <= Dimensions[theta2][[2]], j++,
            t2 = t2 + theta2[[i, j]]^2;
        ]
    ];
    J = J + lambda*(t1 + t2)/(2*m);
    (* NOTE(review): this delta3 initialization is dead — it is overwritten on
       the first iteration of the loop below. *)
    delta3 = ConstantArray[0, {1, 1}];
    theta2grad = ConstantArray[0, Dimensions[theta2]];
    theta1grad = ConstantArray[0, Dimensions[theta1]];

    (* Per-example backpropagation. delta2 is computed over bias + hidden units
       (a 1 is inserted in front of z2's row), then its bias component is
       dropped with [[2 ;;]] before forming theta1's gradient. *)
    For[i = 1, i <= m, i++,
        delta3 = a3[[i]] - y[[i]];
        delta2 = (theta2.delta3)*siggrad[Insert[z2[[i, All]], 1, 1]];
        theta2grad = theta2grad + Transpose[{a2[[i, All]]}].{delta3};
        theta1grad = theta1grad + Transpose[{x1[[i, All]]}].{delta2[[2 ;;]]};
    ];
    theta1grad = theta1grad/m;
    theta2grad = theta2grad/m;
    (* Add the regularization gradient to all rows except the bias row,
       matching the exclusion in the cost above. *)
    theta1grad[[2 ;;, All]] = 
    theta1grad[[2 ;;, All]] + (lambda/m)*theta1[[2 ;;, All]];
    theta2grad[[2 ;;, All]] = theta2grad[[2 ;;, All]] + (lambda/m)*theta2[[2 ;;, All]];
    {J, theta1grad, theta2grad}
]
update[theta1_, theta1grad_, lambda_] := theta1 - lambda*theta1grad
(* Forward pass only: returns the m-by-k matrix of output-layer activations
   for input matrix x. A bias unit of 1 is prepended to each layer's
   activations, mirroring the weight layout used in costfunction. *)
predict[x_, theta1_, theta2_] := Module[{a1, hidden},
    a1 = Prepend[#, 1] & /@ x;
    hidden = Prepend[#, 1] & /@ sigmoid[a1.theta1];
    sigmoid[hidden.theta2]
]
(* XOR-style training set: 4 examples, 2 features each. *)
x = {
    {0.0, 0.0},
    {1.0, 1.0},
    {1, 0},
    {0, 1}
    };
(* One-hot targets: column 1 = "inputs equal", column 2 = "inputs differ". *)
y = {
    {1, 0},
    {1, 0},
    {0, 1},
    {0, 1}
    };
(* Small random initial weights break the symmetry between hidden units. *)
theta1 = RandomReal[{-.12, .12}, {3, 4}]; (* (2 inputs + bias) x 4 hidden *)
theta2 = RandomReal[{-.12, .12}, {5, 2}]; (* (4 hidden + bias) x 2 outputs *)
theta1         // MatrixForm
theta2         // MatrixForm
(* Regularization strength. Was flagged in the post as the cause of the
   network collapsing to 0.5 when set too high.
   Fix: the original trailer `// <--- ... REGULARISATION` was not a valid
   Wolfram comment (`//` is postfix apply), so it has been replaced with a
   real (* ... *) comment.
   NOTE(review): lambda is also passed to update[] as the gradient-descent
   step size below — confirm that double duty is intentional. *)
lambda = 0.75;
(* Batch gradient descent; print the cost every 100 iterations. *)
For[i = 1, i <= 1000, i++,
    {J, theta1grad, theta2grad} = costfunction[x, y, theta1, theta2, lambda];
    theta1 = update[theta1, theta1grad, lambda];
    theta2 = update[theta2, theta2grad, lambda];
    If[Mod[i, 100] == 0, Print[J];];
];
theta1 // MatrixForm
theta2 // MatrixForm

0 个答案:

没有答案