Neural network in Python: decision / classification always gives 0.5

Date: 2016-02-10 12:26:50

Tags: python neural-network classification

First of all, I want to say that I am a Python beginner and also new to neural networks. When I read about them I got very excited and decided to set up some code from scratch (see the code below).

But somehow my code is not working properly. I guess there are some major mistakes (in the algorithm and/or the programming?), but right now I cannot find them.

In the handwritten notes you can see my setup (and some formulas). I want to solve a decision problem where the data comes in the form X = (x1, x2) and y (0 or 1).

My network has one hidden layer with 3 neurons, plus an output layer. As activation function I use the sigmoid, and for the loss I use the cross-entropy (which is just the Bernoulli log-likelihood, I guess?).
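To make the loss concrete, this is the formula I have in mind, as a tiny stand-alone snippet (the function name here is only for illustration, it is not used in the code below):

import numpy as np

def binary_cross_entropy(y, p):
    # negative Bernoulli log-likelihood of a label y in {0, 1}
    # given a predicted probability p in (0, 1)
    return -(y * np.log(p) + (1.0 - y) * np.log(1.0 - p))

print(binary_cross_entropy(1, 0.9))  # small loss for a confident, correct prediction
print(binary_cross_entropy(1, 0.5))  # about 0.693, the loss of a pure coin flip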

A neuron takes the weighted input W.X + bias and returns a scalar between 0 and 1.
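So a single neuron does something like this (a rough sketch with made-up numbers, only to illustrate the idea):

import numpy as np

w = np.array([0.3, -0.2])  # weights of one neuron
b = 0.1                    # its bias
x = np.array([1.5, 2.0])   # one data point (x1, x2)
out = 1.0 / (1.0 + np.exp(-(np.dot(w, x) + b)))  # sigmoid of w.x + b
print(out)                 # a scalar between 0 and 1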

For the learning process I try to use backpropagation. So I just computed the derivatives dLoss/dparams and applied the chain rule several times. To avoid writing everything in index notation, I try to use numpy to handle the matrices etc.
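For comparison, this is how I would write out the per-example gradients by hand for a sigmoid output with cross-entropy loss (a separate hand-derived sketch, so it may well differ from what my actual code below computes):

import numpy as np

def gradients_one_example(W1, b1, W2, b2, x, y):
    # forward pass
    t1 = np.dot(W1, x) + b1            # weighted input of hidden layer, shape (3,)
    phi = 1.0 / (1.0 + np.exp(-t1))    # hidden activations
    t2 = np.dot(W2, phi) + b2          # weighted input of output neuron, shape (1,)
    sig = 1.0 / (1.0 + np.exp(-t2))    # predicted probability
    # backward pass: for a sigmoid output with cross-entropy loss the
    # chain dLoss/dsig * dsig/dt2 collapses to (sig - y)
    delta2 = sig - y                                   # shape (1,)
    dW2 = np.outer(delta2, phi)                        # shape (1, 3)
    db2 = delta2                                       # shape (1,)
    delta1 = np.dot(W2.T, delta2) * phi * (1.0 - phi)  # shape (3,)
    dW1 = np.outer(delta1, x)                          # shape (3, 2)
    db1 = delta1                                       # shape (3,)
    return dW1, db1, dW2, db2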

Maybe somebody sees directly what I am doing wrong? (Apart from the programming mistakes :D)

Handwritten notes 1/2 Handwritten notes 2/2

#!/usr/bin/python
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt

## create random data set for decision problem
np.random.seed(0) #fixed seed to reproduce results
X, y = datasets.make_moons(20, noise=0.20) # lists containing the Data
plt.scatter(X[:,0], X[:,1], s=40, c=y, cmap=plt.cm.Spectral) # plot it
plt.show() # show plot; proceeds when plot is closed

## initialize model parameters
W1 = np.random.uniform(-0.5,0.5,[3,2]) # hidden layer weights (3 x 2 matrix)
b1 = np.random.uniform(-1,1,[3])   # bias for neurons in hidden layer
W2 = np.random.uniform(-0.5,0.5,[1,3]) # weights for output layer (1 x 3)
b2 = np.random.uniform(-1,1,[1]) # bias for output neuron

# collecting parameters in model dict
model = {"W1" : W1, "W2" : W2, "b1" : b1, "b2" : b2}

## the activation function
# can also return the derivative
def sigmoid(x,derivative = False):
    if derivative == True:
        # derivative; np.multiply multiplies element-wise
        # needed if x is tensor-like object
        return np.multiply(sigmoid(x), (1 - sigmoid(x)))
    else:
        return 1.0/(1.0 + np.exp(-x))

## moving forward in the network for a single data point
# and returns a dict with necessary information
def move_forward(model, DataX):
    W1 = model["W1"] # extract model params from dict to make it better readable
    W2 = model["W2"]
    b1 = model["b1"]
    b2 = model["b2"]
    t1 = np.dot(W1,DataX) + b1 # weighted input for hidden layer (here 3-dim object)
    phi = sigmoid(t1) # evaluate activation function
    phiP = sigmoid(t1, True) # derivative (needed for moving backward "learning")
    t2 = np.dot(W2,phi) + b2 # weighted input for output layer (1-dim object)
    sig = sigmoid(t2) # evaluate final output
    sigP = sigmoid(t2, True) # derivative
    forward = {"phi" : phi,"phiP" : phiP, # dict collecting the output
             "sig" : sig, "sigP" : sigP}
    return forward

## moving backward for a single data point
def move_backward(forward, model, DataX):
    W1 = model["W1"]
    W2 = model["W2"]
    b1 = model["b1"]
    b2 = model["b2"]    
    phi = forward["phi"]
    phiP = forward["phiP"]
    sig = forward["sig"]
    sigP = forward["sigP"]
    #not the full deltaWs / deltabs; multiplied by the rest in "update_model"
    dW2 = sigP * phi # part from "derivative chain" roughly: dsig/dt2 dt2 / dW2
    db2 = sigP # analogue
    temp = np.multiply(W2,phiP) # multiplied element wise
    dW1 = sigP * np.outer(temp, DataX) # outer product since: (W2 * phi)_j x_i
    db1 = sigP * np.outer(temp, [1]) # analogue
    backward = {"dW1": dW1, "dW2": dW2, "db1": db1, "db2": db2}
    return backward

## part of the loss function; here for one data point
# returns also the derivative for the learning process
def loss(DataY, PredictionY, derivative = False):
    if derivative == True:
        return DataY / PredictionY - (1.0 - DataY) / (1.0 - PredictionY)
    log_likelihood = DataY * np.log(PredictionY) + (1.0 - DataY) * np.log(1.0 - PredictionY) 
    return log_likelihood

## updating model parameters
## epsilon is a small parameter regulating the learning
def update_model(DataSet,model, epsilon):
    DataX = DataSet[0]
    DataY = DataSet[1]
    total_loss = 0
    dW1_total = 0
    dW2_total = 0
    db1_total = 0
    db2_total = 0
    beta = 0
    W1 = model["W1"]
    W2 = model["W2"]
    b1 = model["b1"]
    b2 = model["b2"]
    # iterating over full data set
    for i in range(len(DataX)):
        forward = move_forward(model, DataX[i])
        backward = move_backward(forward, model, DataX[i])
        sig = forward["sig"]        
        total_loss += loss(DataY[i],sig)
        beta += loss(DataY[i],sig, True)
        dW1_total += backward["dW1"]
        dW2_total += backward["dW2"]
        db1_total += backward["db1"]
        db2_total += backward["db2"]
    total_loss *= -1.0/len(DataX) # the total loss
    beta *= -1.0/len(DataX) # the derivative of dloss/dsig
    ## setting updated model params
    W1_new = W1 - epsilon * beta * dW1_total
    W2_new = W2 - epsilon * beta * dW2_total
    b1_new = b1 - epsilon * beta * np.squeeze(np.asarray(db1_total)) 
    b2_new = b2 - epsilon * beta *  db2_total
    model_updated = {"W1": W1_new, "W2": W2_new, "b1": b1_new,
                     "b2": b2_new, "loss": total_loss}
    return model_updated

## train the model with a given data set N times
def train_model(DataSet,model, epsilon, N, print_state = False):
    for i in range(N):        
        model = update_model(DataSet,model, epsilon)
        if print_state == True:
            if i % 100 == 0:
                print(model)
                print("loss = " , model["loss"])
    print(model)
    return model


## call the training function and store the output
model_new = train_model([X,y],model, 0.01, 1000, True)
## check result with data point in the training set
move_forward(model_new,X[0])

# Note: Hm, somehow I always get sig = 0.5 (roughly). And the loss
# does not get smaller than 0.68
# I guess there must be several mistakes
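
A finite-difference gradient check might help to locate where my analytic derivatives go wrong; something along these lines (just a rough sketch, the helper below is made up and not part of my code above):

import numpy as np

def numerical_grad(loss_fn, param, eps=1e-5):
    # central finite differences of the scalar loss_fn() with respect to
    # every entry of `param` (each entry is perturbed in place and restored)
    grad = np.zeros_like(param, dtype=float)
    it = np.nditer(param, flags=["multi_index"])
    while not it.finished:
        idx = it.multi_index
        old = param[idx]
        param[idx] = old + eps
        f_plus = loss_fn()
        param[idx] = old - eps
        f_minus = loss_fn()
        param[idx] = old  # restore the original value
        grad[idx] = (f_plus - f_minus) / (2.0 * eps)
        it.iternext()
    return grad

# loss_fn would be a small closure that runs move_forward over the whole
# data set with the current model and returns the averaged loss, so the
# result can be compared entry by entry with dW1/dW2/db1/db2 from move_backward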

0 Answers:

There are no answers yet.