First of all, I want to say that I am a Python beginner and also new to neural networks. I got quite excited when reading about them and thought I would set up some code from scratch (see below).
But somehow my code is not working properly. I suspect there are some major mistakes (in the algorithm and/or the programming?), but right now I cannot find them.
In the handwritten notes (linked below) you can see my setup (and some formulas). The problem I want to solve is a decision problem where I have data of the form X = (x1, x2) and y (0 or 1).
My network has one hidden layer consisting of 3 neurons and one output layer. As activation function I use the sigmoid, and for the loss I use cross-entropy (which is just the log-likelihood of a Bernoulli, I guess?).
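To be explicit, the per-data-point loss I have in mind (with y either 0 or 1 and p the network output) is the negative Bernoulli log-likelihood; a minimal sketch of what I mean (the function name is just for illustration):

import numpy as np

def cross_entropy(y, p):
    # negative log-likelihood of a Bernoulli with parameter p
    return -(y * np.log(p) + (1.0 - y) * np.log(1.0 - p))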
Each neuron takes the weighted input W.X + bias and returns a scalar between 0 and 1.
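So a single hidden neuron computes something like the following (a sketch; w_j and b_j are just illustrative names for that neuron's weight row and bias):

import numpy as np

def neuron_output(w_j, b_j, x):
    # weighted input w_j . x + b_j squashed through the sigmoid -> scalar in (0, 1)
    return 1.0 / (1.0 + np.exp(-(np.dot(w_j, x) + b_j)))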
For the learning process I try to use backpropagation, so I just computed the derivatives dLoss/dparams and applied the chain rule several times. To avoid writing everything in index notation, I tried to use numpy to handle the matrices etc.
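Written out, the chain I applied is roughly the following (with t1 = W1.x + b1, phi = sigmoid(t1), t2 = W2.phi + b2, sig = sigmoid(t2), and phiP, sigP the sigmoid derivatives evaluated at t1, t2):

dLoss/dW2_j  = dLoss/dsig * sigP * phi_j
dLoss/db2    = dLoss/dsig * sigP
dLoss/dW1_ji = dLoss/dsig * sigP * (W2_j * phiP_j) * x_i
dLoss/db1_j  = dLoss/dsig * sigP * (W2_j * phiP_j)

In the code below, the common factor dLoss/dsig is the beta that I only apply at the end in update_model.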
Maybe someone sees directly what I am doing wrong? (Apart from programming mistakes :D)
[Handwritten notes 1/2] [Handwritten notes 2/2]
#!/usr/bin/python
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
## create random data set for decision problem
np.random.seed(0) #fixed seed to reproduce results
X, y = datasets.make_moons(20, noise=0.20) # lists containing the Data
plt.scatter(X[:,0], X[:,1], s=40, c=y, cmap=plt.cm.Spectral) # plot it
plt.show() # show plot; proceeds when plot is closed
## initialize model parameters
W1 = np.random.uniform(-0.5,0.5,[3,2]) # hidden layer weights (3 x 2) matrix
b1 = np.random.uniform(-1,1,[3]) # bias for neurons in hidden layer
W2 = np.random.uniform(-0.5,0.5,[1,3]) # weights for output layer (1 x 3)
b2 = np.random.uniform(-1,1,[1]) # bias for output neuron
# collecting parameters in model dict
model = {"W1" : W1, "W2" : W2, "b1" : b1, "b2" : b2}
## the activation function
# can also return the derivative
def sigmoid(x, derivative=False):
    if derivative == True:
        # derivative; np.multiply multiplies element-wise
        # needed if x is a tensor-like object
        return np.multiply(sigmoid(x), (1 - sigmoid(x)))
    else:
        return 1.0/(1.0 + np.exp(-x))
## moving forward in the network for a single data point
# and returns a dict with necessary information
def move_forward(model, DataX):
    W1 = model["W1"]  # extract model params from the dict for readability
    W2 = model["W2"]
    b1 = model["b1"]
    b2 = model["b2"]
    t1 = np.dot(W1, DataX) + b1  # weighted input for hidden layer (here a 3-dim object)
    phi = sigmoid(t1)            # evaluate activation function
    phiP = sigmoid(t1, True)     # derivative (needed for moving backward, i.e. "learning")
    t2 = np.dot(W2, phi) + b2    # weighted input for output layer (1-dim object)
    sig = sigmoid(t2)            # evaluate final output
    sigP = sigmoid(t2, True)     # derivative
    forward = {"phi": phi, "phiP": phiP,  # dict collecting the output
               "sig": sig, "sigP": sigP}
    return forward
## moving backward for a single data point
def move_backward(forward, model, DataX):
    W1 = model["W1"]
    W2 = model["W2"]
    b1 = model["b1"]
    b2 = model["b2"]
    phi = forward["phi"]
    phiP = forward["phiP"]
    sig = forward["sig"]
    sigP = forward["sigP"]
    # not the full deltaWs / deltabs; multiplied by the rest in "update_model"
    dW2 = sigP * phi  # part of the derivative chain, roughly: dsig/dt2 * dt2/dW2
    db2 = sigP        # analogous
    temp = np.multiply(W2, phiP)        # multiplied element-wise
    dW1 = sigP * np.outer(temp, DataX)  # outer product since: (W2 * phiP)_j x_i
    db1 = sigP * np.outer(temp, [1])    # analogous
    backward = {"dW1": dW1, "dW2": dW2, "db1": db1, "db2": db2}
    return backward
## part of the loss function; here for one data point
# also returns the derivative, for the learning process
def loss(DataY, PredictionY, derivative=False):
    if derivative == True:
        return DataY / PredictionY - (1.0 - DataY) / (1.0 - PredictionY)
    log_likelihood = DataY * np.log(PredictionY) + (1.0 - DataY) * np.log(1.0 - PredictionY)
    return log_likelihood
## updating model parameters
## epsilon is a small parameter regulating the learning
def update_model(DataSet, model, epsilon):
    DataX = DataSet[0]
    DataY = DataSet[1]
    total_loss = 0
    dW1_total = 0
    dW2_total = 0
    db1_total = 0
    db2_total = 0
    beta = 0
    W1 = model["W1"]
    W2 = model["W2"]
    b1 = model["b1"]
    b2 = model["b2"]
    # iterating over the full data set
    for i in range(len(DataX)):
        forward = move_forward(model, DataX[i])
        backward = move_backward(forward, model, DataX[i])
        sig = forward["sig"]
        total_loss += loss(DataY[i], sig)
        beta += loss(DataY[i], sig, True)
        dW1_total += backward["dW1"]
        dW2_total += backward["dW2"]
        db1_total += backward["db1"]
        db2_total += backward["db2"]
    total_loss *= -1.0/len(DataX)  # the total loss
    beta *= -1.0/len(DataX)        # the derivative dLoss/dsig
    ## setting updated model params
    W1_new = W1 - epsilon * beta * dW1_total
    W2_new = W2 - epsilon * beta * dW2_total
    b1_new = b1 - epsilon * beta * np.squeeze(np.asarray(db1_total))
    b2_new = b2 - epsilon * beta * db2_total
    model_updated = {"W1": W1_new, "W2": W2_new, "b1": b1_new,
                     "b2": b2_new, "loss": total_loss}
    return model_updated
## train the model with a given data set N times
def train_model(DataSet, model, epsilon, N, print_state=False):
    for i in range(N):
        model = update_model(DataSet, model, epsilon)
        if print_state == True:
            if i % 100 == 0:
                print(model)
                print("loss = ", model["loss"])
    print(model)
    return model
## call the training function and store the output
model_new = train_model([X,y],model, 0.01, 1000, True)
## check the result with a data point from the training set
print(move_forward(model_new, X[0]))
# Note: Hm, somehow I always get sig = 0.5 (roughly). And the loss
# does not get smaller than 0.68
# I guess there must be several mistakes
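One more thing that might help: to check the backward pass, I could compare the analytic pieces against a finite-difference estimate. A rough sketch (it reuses loss and move_forward from above; numerical_grad_W2 is only an illustrative helper, not part of my script):

def numerical_grad_W2(model, x, y, eps=1e-5):
    # finite-difference estimate of dLoss/dW2 for a single data point
    grad = np.zeros_like(model["W2"])
    for idx in np.ndindex(*model["W2"].shape):
        m_plus = dict(model); m_minus = dict(model)
        m_plus["W2"] = model["W2"].copy(); m_minus["W2"] = model["W2"].copy()
        m_plus["W2"][idx] += eps
        m_minus["W2"][idx] -= eps
        # loss() returns the log-likelihood, so the loss itself is its negative
        l_plus = -loss(y, move_forward(m_plus, x)["sig"][0])
        l_minus = -loss(y, move_forward(m_minus, x)["sig"][0])
        grad[idx] = (l_plus - l_minus) / (2.0 * eps)
    return grad

# e.g. compare with the analytic piece for the first training point
# (bwd["dW2"] times the dLoss/dsig factor should match entry-wise):
fwd = move_forward(model_new, X[0])
bwd = move_backward(fwd, model_new, X[0])
print(numerical_grad_W2(model_new, X[0], y[0]))
print(-loss(y[0], fwd["sig"][0], True) * bwd["dW2"])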