I tried to implement a very simple RNN step (a single RNN cell) and compare it against Andrew Ng's function to check whether everything works.
However, my results differ from his, even though all I did was put the batch size on the first dimension instead of the second! Everything else is the same (apart from having to change the order of the multiplications so that the matching dimensions sit next to each other; the small sketch after my implementation below checks that equivalence). Here is my implementation:
import numpy as np

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

class RNNClass(object):
    def __init__(self, vocab_size, outputsize, hidden_state_size=100):
        np.random.seed(1)
        # The weight shapes are determined by hidden_state_size and vocab_size,
        # because the weights multiply the input and the hidden state and the
        # result must have the hidden size.
        # W1 multiplies xt, which has shape (batch, vocabsize)
        self.W1 = np.random.randn(vocab_size, hidden_state_size)
        # W2 multiplies the hidden state, which has shape h = (batchsize, hiddensize).
        # Since xt.W1 and h0.W2 are added together, h0.W2 must have the same
        # shape as the result of xt.W1.
        self.W2 = np.random.randn(hidden_state_size, hidden_state_size)
        # W3 multiplies the hidden state (h1.W3), and the result should
        # ultimately have the output size.
        self.W3 = np.random.randn(outputsize, hidden_state_size)
        # hidden bias, shape (hidden_state_size,), broadcast over the batch
        self.bh = np.random.randn(hidden_state_size)
        # output bias, shape (outputsize,), broadcast over the batch
        self.bo = np.random.randn(outputsize)
        self.outputsize = outputsize
        self.hidden_state_size = hidden_state_size

    def rnn_cell_foward(self, xt, h0):
        """
        Run the forward pass for a single timestep of a vanilla RNN that uses a tanh
        activation function.
        The input data has dimension D (vocabsize), the hidden state has dimension
        H (HiddenSize), and we use a minibatch size of N (Batch_size).
        Inputs:
        - xt: Input data for this timestep, of shape (Batch_size, vocabsize_or_basically_input_dim_size).
        - h0: Hidden state from the previous timestep, of shape (Batch_size, HiddenSize)
        - W1: Weight matrix for input-to-hidden connections, of shape (vocabsize, HiddenSize)
        - W2: Weight matrix for hidden-to-hidden connections, of shape (HiddenSize, HiddenSize)
        - W3: Weight matrix for hidden-to-output connections, of shape (vocabsize_or_output_dim_size, hidden_state_size)
        - bh: Biases of shape (HiddenSize,)
        - bo: Biases of shape (vocabsize_or_output_dim_size,)
        """
        h_t = np.tanh(np.dot(xt, self.W1) + np.dot(h0, self.W2) + self.bh)
        o_t = softmax(np.dot(h_t, self.W3.T) + self.bo)
        return o_t, h_t
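As a quick aside, here is a minimal sketch (toy shapes of my own choosing, not taken from the exercise) of the transpose identity I am relying on when I move the batch to the first dimension: (A.B)^T = B^T.A^T, so xt.W1 in my layout should equal the transpose of Wax.xt in Andrew's layout whenever W1 is the transpose of Wax.
import numpy as np

# Toy shapes, purely for illustration.
rng = np.random.RandomState(0)
batch, vocab, hidden = 4, 3, 5
xt_batch_first = rng.randn(batch, vocab)   # my layout: (batch, vocab)
W1 = rng.randn(vocab, hidden)              # my layout: (vocab, hidden)
xt_batch_second = xt_batch_first.T         # Andrew's layout: (vocab, batch)
Wax = W1.T                                 # Andrew's layout: (hidden, vocab)
# xt.W1 should be the transpose of Wax.xt when the weights are transposes of each other.
print(np.allclose(np.dot(xt_batch_first, W1),
                  np.dot(Wax, xt_batch_second).T))   # prints True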
And here is how I call it:
vocabsize=3
hidden_state_size=5
outputsize=2
batch=10
Xt = np.random.rand(batch, vocabsize)
h0 = np.zeros(shape=(batch, hidden_state_size))
rnn = RNNClass(vocab_size=vocabsize, outputsize=outputsize, hidden_state_size=hidden_state_size)
yt_pred,a_next = rnn.rnn_cell_foward(Xt, h0)
# transpose so the output is laid out like Andrew's result
a_next = a_next.transpose(1, 0)
yt_pred = yt_pred.transpose(1, 0)
print("a_next[4] = ", a_next[4])
print("a_next.shape = ", a_next.shape)
print("yt_pred[1] =", yt_pred[1])
print("yt_pred.shape = ", yt_pred.shape)
Output:
a_next[4] = [0.84867458 0.77846452 0.58705883 0.88028079 0.46130119 0.39808808 0.01003178 0.406457 0.41351936 0.9144255 ]
a_next.shape = (5, 10)
yt_pred[1] = [0.06592572 0.06621226 0.13315296 0.06556298 0.08856467 0.14952982 0.13894541 0.13843746 0.08882247 0.06484625]
yt_pred.shape = (2, 10)
Here is his implementation:
def rnn_cell_forward(xt, a_prev, parameters):
    """
    Implements a single forward step of the RNN-cell as described in Figure (2)
    Arguments:
    xt -- your input data at timestep "t", numpy array of shape (n_x, m).
    a_prev -- Hidden state at timestep "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary containing:
        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
        ba -- Bias, numpy array of shape (n_a, 1)
        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    Returns:
    a_next -- next hidden state, of shape (n_a, m)
    yt_pred -- prediction at timestep "t", numpy array of shape (n_y, m)
    cache -- tuple of values needed for the backward pass, contains (a_next, a_prev, xt, parameters)
    """
    # Retrieve parameters from "parameters"
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    Wya = parameters["Wya"]
    ba = parameters["ba"]
    by = parameters["by"]
    ### START CODE HERE ### (≈2 lines)
    a_next = np.tanh(np.dot(Wax, xt) + np.dot(Waa, a_prev) + ba)
    # compute output of the current cell using the formula given above
    yt_pred = softmax(np.dot(Wya, a_next) + by)
    ### END CODE HERE ###
    # store values you need for backward propagation in cache
    cache = (a_next, a_prev, xt, parameters)
    return a_next, yt_pred, cache
np.random.seed(1)
xt = np.random.randn(3,10)
a_prev = np.random.randn(5,10)
Waa = np.random.randn(5,5)
Wax = np.random.randn(5,3)
Wya = np.random.randn(2,5)
ba = np.random.randn(5,1)
by = np.random.randn(2,1)
parameters = {"Waa": Waa, "Wax": Wax, "Wya": Wya, "ba": ba, "by": by}
a_next, yt_pred, cache = rnn_cell_forward(xt, a_prev, parameters)
print("a_next[4] = ", a_next[4])
print("a_next.shape = ", a_next.shape)
print("yt_pred[1] =", yt_pred[1])
print("yt_pred.shape = ", yt_pred.shape)
And the output it generates:
a_next[4] = [ 0.59584544 0.18141802 0.61311866 0.99808218 0.85016201 0.99980978 -0.18887155 0.99815551 0.6531151 0.82872037]
a_next.shape = (5, 10)
yt_pred[1] = [ 0.9888161 0.01682021 0.21140899 0.36817467 0.98988387 0.88945212 0.36920224 0.9966312 0.9982559 0.17746526]
yt_pred.shape = (2, 10)
As you can see, I also used np.random.seed(1), yet the results are different. I am confused about what is causing this difference.
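For reference, a minimal sketch (made-up shapes, not a diagnosis of the code above) of how np.random.seed behaves: the seed only reproduces the same numbers when the draws that follow it use the same functions, the same shapes, and the same order.
import numpy as np

np.random.seed(1)
a = np.random.randn(5, 3)   # first draw after the seed: randn with shape (5, 3)

np.random.seed(1)
b = np.random.rand(3, 5)    # same seed, but a different function and shape

np.random.seed(1)
c = np.random.randn(5, 3)   # identical call sequence reproduces a exactly

print(np.allclose(a, c))              # True
print(a.ravel()[:3], b.ravel()[:3])   # different values despite the same seed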