import numpy as np
from numpy import exp
from numpy import random
from numpy import log
from numpy import size
from numpy import amax
from numpy import multiply as mul
from numpy import vstack
from matplotlib import pyplot as plot
def sigmoid(x):
return 1.0 / (1+ exp(-x) )
def backPro(X,y,alpha):
a1=X.T
n1=size(X,1)
n2=10
m=size(a1,1)
a1 = vstack((np.ones((1,m)), a1))
K=size(y,0)
Jlist=[]
s=100
lit=0
#initialization (radonize)
theta1=random.rand(n2,n1+1)
theta2=random.rand(K,n2+1)
while lit<10000:
z2 = theta1 * a1
a2 = sigmoid(z2)
a2 = vstack((np.ones((1,m)), a2))
z3 = theta2 * a2
H = sigmoid(z3)
J = (-1.0/m) * np.sum(mul(y,log(H)) + mul(1-y,log(1-H)))
Jlist.append(J)
sigma3 = H-y
sigma2 = mul( (theta2.T * sigma3), mul(a2 , (1-a2)) )
delta1 = sigma2[1:] * a1.T * (1.0/m)
delta2 = sigma3 * a2.T * (1.0/m)
#do the gradient descent
theta1 -= (delta1 * alpha)
theta2 -= (delta2 * alpha)
#update the s
#s=max( amax(np.abs(delta1)),amax(np.abs(delta2)) )
lit+=1
plot.scatter(range(0,len(Jlist)),Jlist,1)
plot.scatter(range(0,len(Jlist[-100:])), Jlist[-100:],1)
print "The J is "+str(J)
print "The S is "+str(s)
print "lit is "+str(lit)
return theta1,theta2
def cost(a1,y,h):
m=size(a1,0)
Kmatrix=np.sum( mul(y, log(h)) + mul((1-y), log(1-h)) )
J= (-1.0/m)* Kmatrix
return J
def predict(X, theta1, theta2):
X = X.T
m = size(X,1)
X = vstack((np.ones((1,m)),X))
hidden = sigmoid(theta1 * X)
hidden = vstack((np.ones((1,m)),hidden))
print sigmoid(theta2 * hidden)
return np.argmax( sigmoid(theta2 * hidden),0 ).T + 1
我想用这个多分类器来分类已经作为数字传输的手写数字(原始图像只是简单的二进制像素,这意味着1表示像素是纯黑色,否则是纯白色)
实际上,X以numpy.matrix的形式存储数据,其维数为n1 * m(row * collon),这里n1表示输入层节点的数量,m是数据集的数量,和y存储标签(结果从1到9),也是numpy.matrix的形式,并且具有K * m的尺寸(K是这里9的标签数量,m也是数据集的数量)。顺便说一下,我已经取消了backPro中s的计算。
当我将X,y和alpha(学习率)放入backPro并获得theta1和theta2时,我然后使用这两个参数使用训练集计算输出(是的,它只是训练集),但是发现所有数据集的预测是相同的,这里我的意思完全相同,我发现它令人难以置信.. 这是结果:
from simpleneu import predict as pre
pre(X,theta1,theta2)
[[0.10106717 0.10106717 0.10106717 ... 0.10106717 0.10106717 0.10106717]
[0.10169492 0.10169492 0.10169492 ... 0.10169492 0.10169492 0.10169492]
[0.09981168 0.09981168 0.09981168 ... 0.09981168 0.09981168 0.09981168]
...
[0.09918393 0.09918393 0.09918393 ... 0.09918393 0.09918393 0.09918393]
[0.09730069 0.09730069 0.09730069 ... 0.09730069 0.09730069 0.09730069]
[0.09918393 0.09918393 0.09918393 ... 0.09918393 0.09918393 0.09918393]]
Out[99]:
matrix([[2],
[2],
[2],
...,
[2],
[2],
[2]])
并且输出的维数是K * m,并且在predict()函数中我打印出sigmoid(theta2 * hidden),然后将它们作为具有最大潜在的标签的标签