I wrote some code to implement a simple neural network in Python + NumPy, using fmin_cg from SciPy for the optimization. I suspect there is a problem in my gradient function. I also tried a smaller network, but still cannot figure out what is wrong: the optimization simply does not converge. I have looked at many similar implementations and cannot find any meaningful difference. Any help would be much appreciated; thanks in advance! (A finite-difference gradient check is sketched below, after the full listing.)
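For reference, the cost function below is meant to implement the standard regularized cross-entropy for a one-hidden-layer sigmoid network, with the bias weights excluded from the penalty:

J(\Theta) = -\frac{1}{m} \sum_{i=1}^{m} \sum_{k=1}^{K} \Big[ y^{(i)}_k \log\big(h_\Theta(x^{(i)})_k\big) + \big(1 - y^{(i)}_k\big) \log\big(1 - h_\Theta(x^{(i)})_k\big) \Big] + \frac{\lambda}{2m} \Big( \sum_{j,\,k \ge 1} \big(\Theta^{(1)}_{j,k}\big)^2 + \sum_{j,\,k \ge 1} \big(\Theta^{(2)}_{j,k}\big)^2 \Big)

where K is the number of labels, h_\Theta is the sigmoid forward pass, and the k = 0 (bias) columns of \Theta^{(1)} and \Theta^{(2)} are left out of the regularization term.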
import numpy as np
from scipy import optimize
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def pack_thetas(t1, t2):
    return np.concatenate((t1.reshape(-1), t2.reshape(-1)))

def unpack_thetas(thetas, input_layer_size, hidden_layer_size, num_labels):
    t1_start = 0
    t1_end = hidden_layer_size * (input_layer_size + 1)
    t1 = thetas[t1_start:t1_end].reshape((hidden_layer_size, input_layer_size + 1))
    t2 = thetas[t1_end:].reshape((num_labels, hidden_layer_size + 1))
    return t1, t2
def cost(params, x, y, lambdap, n, h):
    m, num_labels = np.shape(y)
    # unroll thetas
    t1, t2 = unpack_thetas(params, n, h, num_labels)
    # forward propagation (x already carries the bias column)
    a2 = sigmoid(x.dot(t1.T))
    x2 = np.concatenate((np.ones((m, 1)), a2), axis=1)
    yi = sigmoid(x2.dot(t2.T))
    # cross-entropy summed over all examples and labels
    J = -np.sum(y * np.log(yi) + (1 - y) * np.log(1 - yi))
    # regularization; the first columns (bias weights) are not regularized
    J += (np.sum(t1[:, 1:] ** 2) + np.sum(t2[:, 1:] ** 2)) * lambdap / 2
    return J / m
def grad(params, x, y, lambdap, n, h):
    m, num_labels = np.shape(y)
    # unroll thetas
    t1, t2 = unpack_thetas(params, n, h, num_labels)
    t1g = np.zeros(np.shape(t1))
    t2g = np.zeros(np.shape(t2))
    # backpropagation, one training example at a time
    for i in range(m):
        a1 = np.reshape(x[i, :], (1, n + 1))
        z2 = np.dot(a1, t1.T)
        a2 = np.concatenate((np.ones((1, 1)), sigmoid(z2)), axis=1)
        yout = sigmoid(np.dot(a2, t2.T))
        error3 = yout - y[i, :]                          # output-layer delta
        gz2 = sigmoid(z2) * (1 - sigmoid(z2))            # sigmoid gradient at z2
        error2 = np.dot(t2[:, 1:].T, error3.T) * gz2.T   # hidden-layer delta
        t2g = t2g + np.dot(error3.T, a2)
        t1g = t1g + np.dot(error2, a1)
    # average over the training set; the bias columns must be scaled by 1/m too,
    # not only columns 1 and up
    t1g = t1g / m
    t2g = t2g / m
    # regularize everything except the bias columns
    t1g[:, 1:] += (lambdap / m) * t1[:, 1:]
    t2g[:, 1:] += (lambdap / m) * t2[:, 1:]
    return pack_thetas(t1g, t2g)
x = np.loadtxt("x.txt")
y = np.loadtxt("y.txt").astype(int)   # labels must be integers to index with
m, n = np.shape(x)
# adding the bias column x0 to x
xb = np.concatenate((np.ones((m, 1)), x), axis=1)
# one-hot encoding y (labels run from 1 to 10)
yb = np.zeros((m, 10))
for i in range(m):
    yb[i, y[i] - 1] = 1
print("Done loading data...")
h = 25
num_labels = 10
epsilon = 0.12
lambdap = 1.0
# random initialization in [-epsilon, epsilon]
theta1 = np.random.random((h, n + 1)) * 2 * epsilon - epsilon
theta2 = np.random.random((num_labels, h + 1)) * 2 * epsilon - epsilon
params = pack_thetas(theta1, theta2)
# cost at the initial thetas, without and with regularization
print(cost(params, xb, yb, 0.0, n, h))
print(cost(params, xb, yb, lambdap, n, h))
optiparams = optimize.fmin_cg(cost, params, fprime=grad, args=(xb, yb, lambdap, n, h), maxiter=100)
print("Cost at optimum =", cost(optiparams, xb, yb, lambdap, n, h))
# training-set accuracy
t1, t2 = unpack_thetas(optiparams, n, h, num_labels)
a2 = sigmoid(xb.dot(t1.T))
x2 = np.concatenate((np.ones((m, 1)), a2), axis=1)
yi = sigmoid(x2.dot(t2.T))
accuracy = 0
for i in range(m):
    # predicted label = output unit with the highest activation
    if yb[i, np.argmax(yi[i])] == 1:
        accuracy += 1
print("Training accuracy:", accuracy / m)
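When a first-order method fails to converge like this, the quickest test is to compare grad against a finite-difference approximation of cost on a tiny random problem. Below is a minimal sketch using scipy.optimize.check_grad; the tiny sizes and the random data are made up purely for the check and are not part of the script above.

import numpy as np
from scipy import optimize

np.random.seed(0)
nc, hc, kc, mc = 4, 5, 3, 10   # tiny check problem: inputs, hidden units, labels, examples
xs = np.concatenate((np.ones((mc, 1)), np.random.random((mc, nc))), axis=1)
ys = np.zeros((mc, kc))
ys[np.arange(mc), np.random.randint(kc, size=mc)] = 1
ps = pack_thetas(np.random.random((hc, nc + 1)) * 0.24 - 0.12,
                 np.random.random((kc, hc + 1)) * 0.24 - 0.12)
# check_grad returns ||analytic - numeric||_2; for a gradient consistent
# with the cost it should be tiny (around 1e-6 or less) at these scales
print(optimize.check_grad(cost, grad, ps, xs, ys, 1.0, nc, hc))

If the number comes out large, perturbing one parameter at a time with (cost(p + e) - cost(p - e)) / (2 * e) and comparing element by element shows whether the disagreement sits in the t1 block or the t2 block of the unrolled parameter vector.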