Training set size = 200000
input layer size = 784
hidden layer size = 50
output layer size = 10
def cost(theta, X, y, lamb):
    """Return the regularized cross-entropy cost and its unrolled gradient.

    The network has one hidden layer. Both weight matrices arrive packed in
    the flat vector ``theta`` and are unpacked with ``reshape(...).T``; the
    gradient is packed the same way (``.T.ravel()``) so it lines up
    element-for-element with ``theta``.

    Relies on module-level names defined elsewhere in this file:
    ``hiddenLayerSize``, ``outputLayerSize``, ``sigmoid`` and
    ``sigmoidGradient``.
    NOTE(review): ``sigmoid``/``sigmoidGradient`` are assumed to operate
    element-wise on arrays -- confirm against their definitions.

    Args:
        theta: 1-D array holding the hidden-layer weights followed by the
            output-layer weights (bias column included in each).
        X: 2-D array with one training example per row (no bias column).
        y: integer class label per example, used to index the rows of an
            identity matrix (so labels must lie in
            ``[0, outputLayerSize)``).
        lamb: regularization strength; bias weights are not penalized.

    Returns:
        Tuple ``(J, theta_grad)``: scalar cost and a 1-D gradient with the
        same length and layout as ``theta``.
    """
    X = np.asarray(X)
    # Use the number of rows actually present so that the cost and the
    # gradient are normalized by the same m as the sums they divide.
    m, n_inputs = X.shape

    # Unpack theta1 and theta2 from the unrolled parameter vector.
    split = hiddenLayerSize * (n_inputs + 1)
    th1 = theta[:split].reshape((n_inputs + 1, hiddenLayerSize)).T
    th2 = theta[split:].reshape((hiddenLayerSize + 1, outputLayerSize)).T

    # One-hot encode the labels: row i is the y[i]-th row of the identity.
    labels = np.asarray(y).reshape(-1)
    Y = np.identity(outputLayerSize)[labels]

    # Forward propagation, prepending a bias unit to each layer's input.
    bias = np.ones((m, 1))
    A1 = np.hstack([bias, X])
    Z2 = A1 @ th1.T
    A2 = np.hstack([bias, sigmoid(Z2)])
    H = sigmoid(A2 @ th2.T)

    # Weights without their bias column (the only ones that are regularized).
    th1_weights = th1[:, 1:]
    th2_weights = th2[:, 1:]

    # Cross-entropy averaged over the examples, plus the L2 penalty.
    penalty = (lamb / (2 * m)) * (np.sum(th1_weights ** 2) + np.sum(th2_weights ** 2))
    J = np.sum(-Y * np.log(H) - (1 - Y) * np.log(1 - H)) / m + penalty

    # Backpropagation. The hidden-layer error uses the local th2 without its
    # bias column, so its shape matches Z2 directly and no padding/deleting
    # of a dummy bias column is needed.
    sigma3 = H - Y
    sigma2 = (sigma3 @ th2_weights) * sigmoidGradient(Z2)

    th1_grad = (sigma2.T @ A1) / m
    th2_grad = (sigma3.T @ A2) / m
    # Regularize every column except the bias column.
    th1_grad[:, 1:] += (lamb / m) * th1_weights
    th2_grad[:, 1:] += (lamb / m) * th2_weights

    # Unroll the gradients in the same order theta was unpacked.
    theta_grad = np.concatenate((th1_grad.T.ravel(), th2_grad.T.ravel()))
    return (J, theta_grad)