I tried to implement a very simple RNN step (a single RNN cell) and compare it against Andrew Ng's function to check whether everything works.
However, my results differ from his, even though all I did was put the batch size on the first dimension instead of the second! Everything else is the same (apart from having to change the order of the multiplications so that the matching dimensions sit next to each other; the small sketch after my implementation below checks that equivalence). Here is my implementation:
import numpy as np

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

class RNNClass(object):
    def __init__(self, vocab_size, outputsize, hidden_state_size=100):
        np.random.seed(1)
        # The weight shapes are determined by hidden_state_size and vocab_size,
        # because the weights multiply the input and the hidden state and the
        # result must have the hidden size.
        # W1 multiplies xt, which has shape (batch, vocabsize)
        self.W1 = np.random.randn(vocab_size, hidden_state_size)
        # W2 multiplies the hidden state, which has shape h = (batchsize, hiddensize).
        # Since xt.W1 and h0.W2 are added together, h0.W2 must have the same
        # shape as the result of xt.W1.
        self.W2 = np.random.randn(hidden_state_size, hidden_state_size)
        # W3 multiplies the hidden state (h1.W3), and the result should
        # ultimately have the output size.
        self.W3 = np.random.randn(outputsize, hidden_state_size)
        # hidden bias, shape (hidden_state_size,), broadcast over the batch
        self.bh = np.random.randn(hidden_state_size)
        # output bias, shape (outputsize,), broadcast over the batch
        self.bo = np.random.randn(outputsize)
        self.outputsize = outputsize
        self.hidden_state_size = hidden_state_size

    def rnn_cell_foward(self, xt, h0):
        """
        Run the forward pass for a single timestep of a vanilla RNN that uses a tanh
        activation function.
        The input data has dimension D (vocabsize), the hidden state has dimension
        H (HiddenSize), and we use a minibatch size of N (Batch_size).
        Inputs:
        - xt: Input data for this timestep, of shape (Batch_size, vocabsize_or_basically_input_dim_size).
        - h0: Hidden state from the previous timestep, of shape (Batch_size, HiddenSize)
        - W1: Weight matrix for input-to-hidden connections, of shape (vocabsize, HiddenSize)
        - W2: Weight matrix for hidden-to-hidden connections, of shape (HiddenSize, HiddenSize)
        - W3: Weight matrix for hidden-to-output connections, of shape (vocabsize_or_output_dim_size, hidden_state_size)
        - bh: Biases of shape (HiddenSize,)
        - bo: Biases of shape (vocabsize_or_output_dim_size,)
        """
        h_t = np.tanh(np.dot(xt, self.W1) + np.dot(h0, self.W2) + self.bh)
        o_t = softmax(np.dot(h_t, self.W3.T) + self.bo)
        return o_t, h_t
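As a quick aside, here is a minimal sketch (toy shapes of my own choosing, not taken from the exercise) of the transpose identity I am relying on when I move the batch to the first dimension: (A.B)^T = B^T.A^T, so xt.W1 in my layout should equal the transpose of Wax.xt in Andrew's layout whenever W1 is the transpose of Wax.
import numpy as np

# Toy shapes, purely for illustration.
rng = np.random.RandomState(0)
batch, vocab, hidden = 4, 3, 5
xt_batch_first = rng.randn(batch, vocab)   # my layout: (batch, vocab)
W1 = rng.randn(vocab, hidden)              # my layout: (vocab, hidden)
xt_batch_second = xt_batch_first.T         # Andrew's layout: (vocab, batch)
Wax = W1.T                                 # Andrew's layout: (hidden, vocab)
# xt.W1 should be the transpose of Wax.xt when the weights are transposes of each other.
print(np.allclose(np.dot(xt_batch_first, W1),
                  np.dot(Wax, xt_batch_second).T))   # prints True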
And here is how I call it:
vocabsize=3
hidden_state_size=5
outputsize=2
batch=10
Xt = np.random.rand(batch, vocabsize)
h0 = np.zeros(shape=(batch, hidden_state_size))
rnn = RNNClass(vocab_size=vocabsize, outputsize=outputsize, hidden_state_size=hidden_state_size)
yt_pred,a_next = rnn.rnn_cell_foward(Xt, h0)
# transpose so the output is laid out like Andrew's result
a_next = a_next.transpose(1, 0)
yt_pred = yt_pred.transpose(1, 0)
print("a_next[4] = ", a_next[4])
print("a_next.shape = ", a_next.shape)
print("yt_pred[1] =", yt_pred[1])
print("yt_pred.shape = ", yt_pred.shape)
Output:
a_next[4] = [0.84867458 0.77846452 0.58705883 0.88028079 0.46130119 0.39808808 0.01003178 0.406457 0.41351936 0.9144255 ]
a_next.shape = (5, 10)
yt_pred[1] = [0.06592572 0.06621226 0.13315296 0.06556298 0.08856467 0.14952982 0.13894541 0.13843746 0.08882247 0.06484625]
yt_pred.shape = (2, 10)
Here is his implementation:
def rnn_cell_forward(xt, a_prev, parameters):
    """
    Implements a single forward step of the RNN-cell as described in Figure (2)
    Arguments:
    xt -- your input data at timestep "t", numpy array of shape (n_x, m).
    a_prev -- Hidden state at timestep "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary containing:
        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
        ba -- Bias, numpy array of shape (n_a, 1)
        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    Returns:
    a_next -- next hidden state, of shape (n_a, m)
    yt_pred -- prediction at timestep "t", numpy array of shape (n_y, m)
    cache -- tuple of values needed for the backward pass, contains (a_next, a_prev, xt, parameters)
    """
    # Retrieve parameters from "parameters"
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    Wya = parameters["Wya"]
    ba = parameters["ba"]
    by = parameters["by"]
    ### START CODE HERE ### (≈2 lines)
    a_next = np.tanh(np.dot(Wax, xt) + np.dot(Waa, a_prev) + ba)
    # compute output of the current cell using the formula given above
    yt_pred = softmax(np.dot(Wya, a_next) + by)
    ### END CODE HERE ###
    # store values you need for backward propagation in cache
    cache = (a_next, a_prev, xt, parameters)
    return a_next, yt_pred, cache
np.random.seed(1)
xt = np.random.randn(3,10)
a_prev = np.random.randn(5,10)
Waa = np.random.randn(5,5)
Wax = np.random.randn(5,3)
Wya = np.random.randn(2,5)
ba = np.random.randn(5,1)
by = np.random.randn(2,1)
parameters = {"Waa": Waa, "Wax": Wax, "Wya": Wya, "ba": ba, "by": by}
a_next, yt_pred, cache = rnn_cell_forward(xt, a_prev, parameters)
print("a_next[4] = ", a_next[4])
print("a_next.shape = ", a_next.shape)
print("yt_pred[1] =", yt_pred[1])
print("yt_pred.shape = ", yt_pred.shape)
And the output it generates:
a_next[4] = [ 0.59584544 0.18141802 0.61311866 0.99808218 0.85016201 0.99980978 -0.18887155 0.99815551 0.6531151 0.82872037]
a_next.shape = (5, 10)
yt_pred[1] = [ 0.9888161 0.01682021 0.21140899 0.36817467 0.98988387 0.88945212 0.36920224 0.9966312 0.9982559 0.17746526]
yt_pred.shape = (2, 10)
As you can see, I also used np.random.seed(1), yet the results are different. I am confused about what is causing this difference.
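For reference, a minimal sketch (made-up shapes, not a diagnosis of the code above) of how np.random.seed behaves: the seed only reproduces the same numbers when the draws that follow it use the same functions, the same shapes, and the same order.
import numpy as np

np.random.seed(1)
a = np.random.randn(5, 3)   # first draw after the seed: randn with shape (5, 3)

np.random.seed(1)
b = np.random.rand(3, 5)    # same seed, but a different function and shape

np.random.seed(1)
c = np.random.randn(5, 3)   # identical call sequence reproduces a exactly

print(np.allclose(a, c))              # True
print(a.ravel()[:3], b.ravel()[:3])   # different values despite the same seed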