I wrote a backpropagation neural network in Python with NumPy, using matrix operations and batch updates. It learns binary functions like XOR just fine, but when I train it one-vs-all on the iris dataset (from sklearn.datasets), with the target y = iris.target == 1, it ends up putting all 1s or all -1s on the output. I have tried learning rates in [0.01, 20], hidden layer sizes of [3, 20] nodes, and up to 50,000 epochs, without any improvement.
Below is the important NN code. _sigmoid is NumPy's tanh function and _dsigmoid is its derivative. I would really appreciate any help!
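For completeness, the two helpers are essentially the following (a minimal sketch; note that the derivative is written in terms of the activation value, since back_propagate calls _dsigmoid on the activations):

import numpy as np

def _sigmoid(x):
    # NumPy's tanh, applied elementwise
    return np.tanh(x)

def _dsigmoid(a):
    # derivative of tanh, written in terms of the activation a = tanh(x)
    return 1 - a ** 2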
def __init__(self, n_input, n_hidden, n_output):
    self.n_input = n_input + 1
    self.n_hidden = n_hidden
    self.n_output = n_output
    self.w1 = np.random.normal(scale=0.7, size=(self.n_input*self.n_hidden)).reshape(self.n_input, self.n_hidden)
    self.w2 = np.random.normal(scale=0.7, size=(self.n_hidden*self.n_output)).reshape(self.n_hidden, self.n_output)
    self.output_activation = np.zeros(n_output)
    self.hidden_activation = np.zeros(n_hidden)
    self.input_activation = np.zeros(n_input)

def feed_forward(self):
    """
    Update output vector created by feed-forward propagation of input activations
    """
    self.hidden_activation = self._sigmoid(np.dot(self.input_activation, self.w1))
    self.output_activation = self._sigmoid(np.dot(self.hidden_activation, self.w2))

def back_propagate(self, target, alpha):
    output_error = target - self.output_activation
    output_delta = output_error * self._dsigmoid(self.output_activation)
    hidden_error = np.dot(output_delta, self.w2.T)
    hidden_delta = hidden_error * self._dsigmoid(self.hidden_activation)
    self.w2 += alpha * (np.dot(self.hidden_activation.T, output_delta))
    self.w1 += alpha * (np.dot(self.input_activation.T, hidden_delta))

def train(self, data, target, alpha, epochs=50):
    m = data.shape[0]
    # add bias to input
    X = np.ones((m, self.n_input))
    X[:, 1:] = data
    # turn target into a column vector
    target = target[:, np.newaxis]
    for epoch in range(epochs):
        self.input_activation = X
        self.feed_forward()
        self.back_propagate(target, alpha)

def predict(self, data):
    m = data.shape[0]
    self.input_activation = np.ones((m, self.n_input))
    self.input_activation[:, 1:] = data
    self.feed_forward()
    return self.output_activation
Answer 0 (score: 0)
This works for me:
import numpy as np
import sklearn.datasets
import math

class NN():
    def __init__(self, n_input, n_hidden, n_output):
        self.n_input = n_input + 1
        self.n_hidden = n_hidden
        self.n_output = n_output
        self.w1 = np.random.normal(scale=0.7, size=(self.n_input*self.n_hidden)).reshape(self.n_input, self.n_hidden)
        self.w2 = np.random.normal(scale=0.7, size=(self.n_hidden*self.n_output)).reshape(self.n_hidden, self.n_output)
        self.output_activation = np.zeros(n_output)
        self.hidden_activation = np.zeros(n_hidden)
        self.input_activation = np.zeros(n_input)

    def _sigmoid(self, x):
        return 1 / (1 + math.e ** (-x))      # sigmoid
        # return np.tanh(x.astype(float))    # tanh

    def _dsigmoid(self, x):
        # derivative expressed in terms of the activation value x
        return x * (1 - x)                   # sigmoid
        # return 1 - x ** 2                  # tanh

    def feed_forward(self):
        """
        Update output vector created by feed-forward propagation of input activations
        """
        self.hidden_activation = self._sigmoid(np.dot(self.input_activation, self.w1))
        self.output_activation = self._sigmoid(np.dot(self.hidden_activation, self.w2))

    def back_propagate(self, target, alpha):
        output_error = (target - self.output_activation)
        output_delta = output_error * self._dsigmoid(self.output_activation)
        hidden_error = np.dot(output_delta, self.w2.T)
        hidden_delta = hidden_error * self._dsigmoid(self.hidden_activation)
        self.w2 += alpha * (np.dot(self.hidden_activation.T, output_delta))
        self.w1 += alpha * (np.dot(self.input_activation.T, hidden_delta))

    def train(self, data, target, alpha, epochs=50):
        m = data.shape[0]
        # add bias to input
        X = np.ones((m, self.n_input))
        X[:, 1:] = data
        # turn target into a column vector
        target = target[:, np.newaxis]
        for epoch in range(epochs):
            self.input_activation = X
            self.feed_forward()
            self.back_propagate(target, alpha)

    def predict(self, data):
        m = data.shape[0]
        self.input_activation = np.ones((m, self.n_input))
        self.input_activation[:, 1:] = data
        self.feed_forward()
        return self.output_activation

iris = sklearn.datasets.load_iris()
data = iris['data']
targets = iris['target']
# relabel for one-vs-all: class 1 stays 1, everything else becomes 0
for i, t in enumerate(targets):
    if t != 1:
        targets[i] = 0

network = NN(4, 3, 1)
network.train(data, targets, 0.01, epochs=10000)
print(network.predict(data))
print(targets)
I changed the activation function to the sigmoid, since that makes more sense when the targets are between 0 and 1. The problem could be in your data preparation or in something else you did not share. That seems unlikely, though, because I could not get it to work with tanh either, and the results varied a lot with the number of hidden neurons. I think you should look at your backpropagation code and try a gradient check.
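For reference, here is a rough sketch of what I mean by a gradient check. It assumes the loss your updates descend on is the summed squared error 0.5 * sum((target - output)**2), which is what the w += alpha * ... updates correspond to; the helper names sse_loss and gradient_check are mine, not part of your code:

import numpy as np

def sse_loss(net, X, target):
    # squared-error loss implied by the back_propagate updates
    net.input_activation = X
    net.feed_forward()
    return 0.5 * np.sum((target - net.output_activation) ** 2)

def gradient_check(net, X, target, eps=1e-5):
    # analytic gradient of the loss w.r.t. w2: since the update is
    # w2 += alpha * dot(h.T, delta), the loss gradient is -dot(h.T, delta)
    net.input_activation = X
    net.feed_forward()
    output_delta = (target - net.output_activation) * net._dsigmoid(net.output_activation)
    analytic = -np.dot(net.hidden_activation.T, output_delta)

    # numerical gradient by central differences, one weight at a time
    numerical = np.zeros_like(net.w2)
    for i in range(net.w2.shape[0]):
        for j in range(net.w2.shape[1]):
            old = net.w2[i, j]
            net.w2[i, j] = old + eps
            loss_plus = sse_loss(net, X, target)
            net.w2[i, j] = old - eps
            loss_minus = sse_loss(net, X, target)
            net.w2[i, j] = old
            numerical[i, j] = (loss_plus - loss_minus) / (2 * eps)

    # relative difference; roughly below 1e-6 means the backprop gradient is fine
    return np.linalg.norm(analytic - numerical) / (np.linalg.norm(analytic) + np.linalg.norm(numerical))

# X must already contain the bias column, as in train/predict
m = data.shape[0]
X = np.ones((m, network.n_input))
X[:, 1:] = data
print(gradient_check(network, X, targets[:, np.newaxis]))

The same finite-difference comparison can be repeated for w1; if either relative difference is large, the bug is in back_propagate rather than in the data preparation.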