Question

我尝试重现TensorFlow中LSTMCell产生的结果，以确保我理解它的工作原理。

这是我的TensorFlow代码：

# Reference run: push one random sequence through TensorFlow's LSTMCell and
# print the cell output at every time step.
num_units = 3
lstm = tf.nn.rnn_cell.LSTMCell(num_units = num_units)

timesteps = 7
num_input = 4
X = tf.placeholder("float", [None, timesteps, num_input])
# Split the time axis (axis 1) into a list of `timesteps` tensors,
# which is the input form passed to static_rnn below.
x = tf.unstack(X, timesteps, 1)
outputs, states = tf.contrib.rnn.static_rnn(lstm, x, dtype=tf.float32)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# Use `timesteps` rather than a literal 7 so the fed value always matches
# the placeholder shape.
x_val = np.random.normal(size = (1, timesteps, num_input))

res = sess.run(outputs, feed_dict = {X:x_val})

# print(e) with a single argument behaves the same on Python 2 and Python 3.
for e in res:
    print(e)

这是它的输出：

[[-0.13285545 -0.13569424 -0.23993783]]
[[-0.04818152  0.05927373  0.2558436 ]]
[[-0.13818116 -0.13837864 -0.15348436]]
[[-0.232219    0.08512601  0.05254192]]
[[-0.20371495 -0.14795329 -0.2261929 ]]
[[-0.10371902 -0.0263292  -0.0914975 ]]
[[0.00286371 0.16377522 0.059478  ]]

这是我自己的实现：

# Hand-written LSTM forward pass over X, which is indexed one row per time step.
n_steps, _ = X.shape
# The hidden state h and the cell state c both start at zero.
h = np.zeros(shape = self.hid_dim)
c = np.zeros(shape = self.hid_dim)

for i in range(n_steps):
    x = X[i,:]

    # Concatenate the current input with the previous hidden state.
    vec = np.concatenate([x, h])
    #vec = np.concatenate([h, x])
    # A single affine map yields the pre-activations of all four gates.
    gs = np.dot(vec, self.kernel) + self.bias


    # Cut the pre-activations into four consecutive chunks of hid_dim entries.
    g1 = gs[0*self.hid_dim : 1*self.hid_dim]
    g2 = gs[1*self.hid_dim : 2*self.hid_dim]
    g3 = gs[2*self.hid_dim : 3*self.hid_dim]
    g4 = gs[3*self.hid_dim : 4*self.hid_dim]

    # By their use below: I scales the candidate N, F scales the old cell
    # state, and O scales the emitted hidden state.
    I = vsigmoid(g1)
    N = np.tanh(g2)
    # NOTE(review): nothing is added to g3 before the sigmoid; TensorFlow's
    # LSTMCell appears to add a forget_bias (default 1.0) here - confirm.
    F = vsigmoid(g3)
    O = vsigmoid(g4)

    # New cell state: gated old state plus gated candidate.
    c = c*F + I*N

    # New hidden state, which is also this step's output.
    h = O * np.tanh(c)

    print h

这是它的输出：

[-0.13285543 -0.13569425 -0.23993781]
[-0.01461723  0.08060743  0.30876374]
[-0.13142865 -0.14921292 -0.16898363]
[-0.09892188  0.11739943  0.08772941]
[-0.15569218 -0.15165766 -0.21918869]
[-0.0480604  -0.00918626 -0.06084118]
[0.0963612  0.1876516  0.11888081]

您可能会注意到，我能够重现第一个隐藏向量，但是第二个及之后的所有向量都不相同。我遗漏了什么？

Answer 1

Tensorflow使用glorot_uniform()函数初始化LSTM的内核（kernel），该函数从随机均匀分布中采样权重。为了获得可重复的结果，我们需要为内核固定一个值：

import tensorflow as tf
import numpy as np

# Seed NumPy's global RNG first so that every random draw below is reproducible.
np.random.seed(0)
timesteps = 7
num_input = 4
# A single sequence (batch of 1) with `timesteps` steps of `num_input` features.
x_val = np.random.normal(size = (1, timesteps, num_input))

# Number of hidden units in the LSTM cell.
num_units = 3

def glorot_uniform(shape):
    """Sample an array of the given shape from the Glorot uniform distribution.

    Values are drawn uniformly from [-bound, bound) with
    bound = sqrt(6 / (shape[0] + shape[1])), using NumPy's global RNG.
    """
    fan_in = shape[0]
    fan_out = shape[1]
    bound = np.sqrt(6.0 / (fan_in + fan_out))
    return np.random.uniform(-bound, bound, shape)

# Draw the kernel once so it can be reused: one row per input feature plus one
# per hidden unit, and 4 * num_units columns.
kernel_init = glorot_uniform((num_input + num_units, 4 * num_units))

我对LSTMCell的实现（嗯，实际上只是稍微重写了tensorflow的代码）：

def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), element-wise for arrays."""
    decay = np.exp(-x)
    return 1.0 / (1.0 + decay)

class LSTMCell():
    """Long short-term memory unit (LSTM) recurrent network cell.

    A NumPy implementation of a single LSTM step. The kernel maps the
    concatenation of the input and the previous hidden state to the
    pre-activations of the four gates, laid out side by side along the
    last axis in the order i, j, f, o.
    """
    def __init__(self, num_units, initializer=glorot_uniform,
               forget_bias=1.0, activation=np.tanh):
        """Initialize the parameters for an LSTM cell.
        Args:
          num_units: int, The number of units in the LSTM cell.
          initializer: Callable taking a `shape` keyword and returning the
            kernel matrix. Default: glorot_uniform
          forget_bias: Constant added to the forget-gate pre-activation on
            every call (default 1.0) in order to reduce the scale of
            forgetting at the beginning of the training.
          activation: Activation function of the inner states.  Default: np.tanh.
        """
        self._num_units = num_units
        self._forget_bias = forget_bias
        self._activation = activation
        self._initializer = initializer

    def build(self, inputs_shape):
        """Create the kernel and the (zero) bias.
        Args:
          inputs_shape: shape of the inputs; only its last entry (the input
            depth) is used.
        """
        input_depth = inputs_shape[-1]
        h_depth = self._num_units
        self._kernel = self._initializer(shape=(input_depth + h_depth, 4 * self._num_units))
        self._bias = np.zeros(shape=(4 * self._num_units))

    def call(self, inputs, state):
        """Run one step of LSTM.
        Args:
          inputs: input numpy array, either 1-D `[input_size]` or
            2-D `[batch, input_size]`.
          state:  a tuple `(c_prev, m_prev)` of numpy arrays with the same
            number of dimensions as `inputs` and `num_units` entries along
            the last axis.
        Returns:
          A tuple containing:
          - A numpy array with `num_units` entries along the last axis,
            representing the output of the LSTM after reading `inputs` when
            previous state was `state`.
          - Numpy array(s) representing the new state of LSTM after reading `inputs` when
            the previous state was `state`.  Same type and shape(s) as `state`.
        """
        (c_prev, m_prev) = state

        # Join input and previous output along the feature (last) axis so the
        # same code handles a single vector and a batch of vectors.
        lstm_matrix = np.concatenate([inputs, m_prev], axis=-1).dot(self._kernel)
        lstm_matrix += self._bias

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        # The gates sit side by side on the last axis; splitting on axis 0
        # would cut the batch dimension of a 2-D input instead.
        i, j, f, o = np.split(lstm_matrix, indices_or_sections=4, axis=-1)
        # New cell state: gated previous state plus gated new input.
        c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) *
               self._activation(j))

        m = sigmoid(o) * self._activation(c)

        new_state = (c, m)
        return m, new_state

# Drop the leading batch axis of size 1 so each row of X is one time step.
X = x_val.reshape(x_val.shape[1:])

# Hand the cell the pre-drawn kernel instead of letting it sample a new one.
cell = LSTMCell(num_units, initializer=lambda shape: kernel_init)
cell.build(X.shape)

# Cell state and hidden state both start at zero.
state = (np.zeros(num_units), np.zeros(num_units))
for i in range(timesteps):
    output, state = cell.call(X[i, :], state)
    print(output)

产生输出：

[-0.21386017 -0.08401277 -0.25431477]
[-0.22243588 -0.25817422 -0.1612211 ]
[-0.2282134  -0.14207162 -0.35017249]
[-0.23286737 -0.17129192 -0.2706512 ]
[-0.11768674 -0.20717363 -0.13339118]
[-0.0599215  -0.17756104 -0.2028935 ]
[ 0.11437953 -0.19484555  0.05371994]

在Tensorflow代码中，如果将第二行替换为

# Build the TensorFlow cell with the same fixed kernel values.
lstm = tf.nn.rnn_cell.LSTMCell(
    num_units=num_units,
    initializer=tf.constant_initializer(kernel_init),
)

返回：

[[-0.2138602  -0.08401276 -0.25431478]]
[[-0.22243595 -0.25817424 -0.16122109]]
[[-0.22821338 -0.1420716  -0.35017252]]
[[-0.23286738 -0.1712919  -0.27065122]]
[[-0.1176867  -0.2071736  -0.13339119]]
[[-0.05992149 -0.177561   -0.2028935 ]]
[[ 0.11437953 -0.19484554  0.05371996]]

Answer 2

我检查了this链接，您的代码几乎是完美的，但是您忘记在 F = vsigmoid(g3) 这一行中加上forget_bias（默认值为1.0）：它实际上应该是 F = vsigmoid(g3+self.forget_bias)，在您的情况下该值为1，即 F = vsigmoid(g3+1)。

这是我用numpy编写的实现，其中关键的一行是：

# Add 1 to the forget-gate pre-activation before applying the sigmoid.
F = vsigmoid(g3+1)

完整代码（运行后会先打印TensorFlow的输出，再打印我的实现的输出）：

import numpy as np
import tensorflow as tf

# Reference run: feed a deterministic sequence through TensorFlow's LSTMCell
# and print the output of every time step.
num_units = 3
lstm = tf.nn.rnn_cell.LSTMCell(num_units = num_units)
batch=1
timesteps = 7
num_input = 4
X = tf.placeholder("float", [batch, timesteps, num_input])
# Split the time axis (axis 1) into a list of `timesteps` tensors.
x = tf.unstack(X, timesteps, 1)
outputs, states = tf.contrib.rnn.static_rnn(lstm, x, dtype=tf.float32)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# Derive the element count from the dimensions instead of hard-coding 28, so
# the reshape keeps working when batch, timesteps or num_input change.
x_val = np.reshape(range(batch * timesteps * num_input),[batch, timesteps, num_input])
res = sess.run(outputs, feed_dict = {X:x_val})
for e in res:
    print(e)
print("\nmy imp\n")
#my impl
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), element-wise for arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# Fetch the weights TensorFlow is using so the NumPy loop computes with the
# very same kernel, bias and forget bias.
kernel,bias=sess.run([lstm._kernel,lstm._bias])
f_b_=lstm._forget_bias
# Both states have num_units columns (previously written as num_input-1,
# which only matched because num_input-1 happens to equal num_units here).
c,h=np.zeros([batch,num_units]),np.zeros([batch,num_units])
for step in range(timesteps):
    # Take time step `step` for every batch row, giving shape [batch, num_input].
    inpt=x_val[:,step,:]
    # One affine map gives the pre-activations of all four gates.
    lstm_mtrx=np.matmul(np.concatenate([inpt,h],1),kernel)+bias
    # The gates are laid out side by side on axis 1 in the order i, j, f, o.
    i,j,f,o=np.split(lstm_mtrx,4,1)
    c=sigmoid(f+f_b_)*c+sigmoid(i)*np.tanh(j)
    h=sigmoid(o)*np.tanh(c)
    print(h)

Answer 3

从线性代数的角度考虑，I * N（红色圆圈处）的矩阵乘法可能存在维度不匹配，从而影响输出，因为 n x m 矩阵点乘 m x p 矩阵会得到 n x p 维的输出。

Answer 4

这里有一篇博客（blog），它可以解答与LSTM有关的任何概念性问题。看起来从头开始构建一个LSTM需要涉及很多内容！

当然，这个答案并不能解决您的问题，而只是提供指导。

TensorFlow的LSTMCell到底如何工作？

4 个答案: