Question

我设计了一个结构可变的网络，但遇到了一些问题。总体思路是：不同的输入会得到结构不同、但共享同一组参数的网络，类似于带自动编码器的递归神经网络。我的代码中有两种情况：如果 c > 1，则运行 combine_feat_gt1_1()；否则运行 combine_feat_gt1_0()。

奇怪的是，如果我把 updates=updates 注释掉，代码可以无错误地运行（代码中的 train_test 函数），但这不是我想要的。如果我取消注释 updates=updates，就会报错（代码中的 train_test_bug 函数）。后者才是我想要实现的。

我已经花了几天时间来处理这个bug。谁能帮我？我会很感激的。

import os
import sys
import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams
from theano.ifelse import ifelse

class Test(object):
    """Toy recurrent model whose symbolic graph branches on the input shape.

    The model owns one weight matrix ``W_r`` and one bias vector ``b_r``.
    Depending on the third dimension of the 4-D input, the features are
    either folded by a recurrence (nested ``theano.scan``) or passed through
    unchanged; in both cases they are flattened into a single row.
    """

    def __init__(
        self,
        numpy_rng,
        input=None,
        output=None,
        n_output=6,
        n_input=3,
        n_group=2,
        W_r=None,
        b_r=None
    ):
        """Create (or adopt) the parameters and the symbolic input/target.

        :param numpy_rng: random generator exposing ``uniform(low, high,
            size)``; only used when ``W_r`` is not supplied.
        :param input: symbolic input; a fresh ``T.tensor4`` when ``None``.
        :param output: symbolic target; a fresh ``T.matrix`` when ``None``.
        :param n_output: stored on the instance, not otherwise used here.
        :param n_input: side length of ``W_r`` and length of ``b_r``.
        :param n_group: stored on the instance, not otherwise used here.
        :param W_r: weights to reuse; created when ``None``.
        :param b_r: bias to reuse; created (all zeros) when ``None``.
        """
        self.n_output = n_output
        self.n_input = n_input
        self.n_group = n_group

        # Test against None explicitly, like ``input``/``output`` below.
        # Truth-testing a supplied value is wrong here: it raises for a
        # multi-element ndarray and would discard any falsy value.
        if W_r is None:
            bound = 4 * numpy.sqrt(6. / (n_input + n_input))
            initial_W_r = numpy.asarray(
                numpy_rng.uniform(
                    low=-bound,
                    high=bound,
                    size=(n_input, n_input)
                ),
                dtype=theano.config.floatX
            )
            W_r = theano.shared(value=initial_W_r, name='W_r', borrow=True)

        if b_r is None:
            b_r = theano.shared(
                value=numpy.zeros(
                    n_input,
                    dtype=theano.config.floatX
                ),
                name='b_r',
                borrow=True
            )

        self.W_r = W_r
        self.b_r = b_r

        if input is None:
            self.x = T.tensor4(name='input', dtype=theano.config.floatX)
        else:
            self.x = input

        if output is None:
            self.y = T.matrix(name='output', dtype=theano.config.floatX)
        else:
            self.y = output

        # Parameters differentiated by get_cost_updates().
        self.params = [self.W_r, self.b_r]

    def get_output_values(self, input):
        """Build the symbolic feature row computed from ``input``.

        ``input`` must have a 4-element shape ``(a, b, c, d)``; only
        ``input[0]`` is read.  When ``c > 1`` each item of ``input[0]`` is
        reduced by a recurrence over its rows, otherwise ``input[0]`` is
        taken as-is.  The chosen result is reshaped to ``(1, -1)``.

        Side effect: compiles two small debugging functions and stores them
        as ``self.debug_ifelse`` and ``self.debug_1_0``.

        :param input: symbolic 4-D tensor.
        :return: symbolic expression with a single row.
        """
        _, _, c, _ = input.shape

        def recurrence_step(x_t, h_tm1, wr, hr):
            # scan passes: current sequence item, previous state, then the
            # non-sequences (weights, bias) in the order they were given.
            return T.dot(h_tm1, wr) + T.dot(x_t, wr) + hr

        def fold_rows(data):
            # The first row seeds the state; the remaining rows are the
            # sequence.  Only the final state is kept.
            hidden, _ = theano.scan(fn=recurrence_step,
                                    sequences=data[1:],
                                    outputs_info=data[0],
                                    non_sequences=[self.W_r, self.b_r],
                                    n_steps=data[1:].shape[0],
                                    strict=True)
            return hidden[-1]

        def combine_feat_gt1_1(batch):
            # Branch for c > 1: fold every item of batch[0] separately.
            feats, _ = theano.scan(fn=fold_rows,
                                   sequences=batch[0],
                                   outputs_info=None,
                                   n_steps=batch[0].shape[0])
            return T.reshape(feats, (1, -1))  # concatenation

        def combine_feat_gt1_0(batch):
            # Branch for c <= 1: no recurrence, just flatten batch[0].
            return T.reshape(batch[0], (1, -1))  # concatenation

        # Both branches are built here; ifelse picks one from the condition.
        feat = ifelse(T.gt(c, 1),
                      combine_feat_gt1_1(input),
                      combine_feat_gt1_0(input))

        # debug code snippet
        self.debug_ifelse = theano.function([input], T.gt(c, 1))
        self.debug_1_0 = theano.function([input], ifelse(T.gt(c, 1), 1, 0))

        return feat

    def get_cost_updates(self, learning_rate=0.1):
        """Return the squared-error cost and gradient-descent updates.

        Also stores the symbolic prediction as ``self.y_given_x``.

        :param learning_rate: step size applied to every gradient
            (defaults to the previously hard-coded 0.1).
        :return: ``(cost, updates)`` where ``updates`` is a list of
            ``(param, new_value)`` pairs for ``self.params``.
        """
        self.y_given_x = self.get_output_values(self.x)
        cost = T.sum((self.y_given_x - self.y) ** 2)

        gparams = T.grad(cost, self.params)
        updates = [
            (param, param - learning_rate * gparam)
            for param, gparam in zip(self.params, gparams)
        ]

        return (cost, updates)


if __name__ == "__main__":

    # Sample of shape (1, 2, 2, 3): third dimension is 2, so the c > 1
    # branch of Test.get_output_values is taken.
    toy_data = numpy.array(
        [[[[1, 1, 1], [2, 2, 2]], [[3, 4, 5], [4, 5, 6]]]],
        dtype=theano.config.floatX)
    label = numpy.array([[1, 2, 3, 4, 5, 6]], dtype=theano.config.floatX)

    # Sample of shape (1, 2, 1, 3): third dimension is 1, so the other
    # branch is taken.
    toy_data2 = numpy.array(
        [[[[1, 1, 1]], [[3, 4, 5]]]],
        dtype=theano.config.floatX)
    label2 = numpy.array([[6, 5, 4, 3, 2, 1]], dtype=theano.config.floatX)

    x = T.tensor4('x', dtype=theano.config.floatX)
    y = T.matrix('y', dtype=theano.config.floatX)
    # Function inputs that are substituted for x / y through ``givens``.
    newX = T.tensor4(dtype=x.dtype)
    newY = T.matrix(dtype=y.dtype)

    rng = numpy.random.RandomState(123)
    test = Test(
        numpy_rng=rng,
        input=x,
        output=y,
        n_group=2,
        n_input=3,
        n_output=6
    )

    cost, updates = test.get_cost_updates()

    # Cost only, no parameter updates: reported to run without error.
    train_test = theano.function(
        [newX, newY],
        cost,
        # updates=updates,
        givens={
            x: newX,
            y: newY
        }
    )

    # Same function with the parameter updates enabled: this is the variant
    # reported to fail.
    train_test_bug = theano.function(
        [newX, newY],
        cost,
        updates=updates,
        givens={
            x: newX,
            y: newY
        }
    )

    # Single-argument print(...) prints the same on Python 2 and is valid
    # syntax on Python 3.
    print(train_test(toy_data, label))
    print(train_test(toy_data2, label2))

    # code with bug
    # print(train_test_bug(toy_data, label))
    # print(train_test_bug(toy_data2, label2))

编辑（@danielrenshaw）

我已将代码缩减为更简单的问题演示。

问题出在双层嵌套 scan 表达式的梯度计算上。将最内层的递归表达式换成另一种写法后，问题消失（请参阅下面第一个函数中的注释）。

import numpy
import theano
import theano.tensor as tt
import theano.ifelse


def inner_scan_step(x_t_t, h_tm1, w):
    """Compute one step of the inner recurrence.

    :param x_t_t: current item of the inner sequence.
    :param h_tm1: state produced by the previous step.
    :param w: weight matrix applied to the previous state.
    :return: ``dot(h_tm1, w) + x_t_t``.
    """
    # Applying the weights to the previous state is the form that fails.
    # The alternative below, which applies them to the input instead, does
    # not fail:
    #     h_tm1 + tt.dot(x_t_t, w)
    weighted_state = tt.dot(h_tm1, w)
    return weighted_state + x_t_t


def outer_scan_step(x_t, w):
    """Run the inner recurrence over one item and return its last state.

    The first row of ``x_t`` seeds the state and the remaining rows form
    the sequence that is scanned.

    :param x_t: one item of the outer sequence.
    :param w: weight matrix forwarded to ``inner_scan_step``.
    :return: final state of the inner scan.
    """
    seed_state = x_t[0]
    remaining_rows = x_t[1:]
    states, _updates = theano.scan(
        fn=inner_scan_step,
        sequences=[remaining_rows],
        outputs_info=[seed_state],
        non_sequences=[w],
        strict=True
    )
    return states[-1]


def get_outputs(x, w):
    """Return the gradient of the summed outer-scan output w.r.t. ``w``.

    :param x: symbolic input iterated over by the outer scan.
    :param w: weight variable that is differentiated.
    :return: symbolic gradient expression.
    """
    per_item, _updates = theano.scan(
        fn=outer_scan_step,
        sequences=[x],
        non_sequences=[w],
        strict=True
    )
    # The gradient has to flow back through both nested scans.
    total = per_item.sum()
    return tt.grad(total, w)


def main():
    """Compile the nested-scan gradient and print it for a small input."""
    # Evaluate test values while the graph is built and raise if one is
    # missing.
    theano.config.compute_test_value = 'raise'

    x_value = numpy.arange(12, dtype=theano.config.floatX).reshape((2, 2, 3))

    x = tt.tensor3()
    x.tag.test_value = x_value

    w = theano.shared(
        value=numpy.ones((3, 3), dtype=theano.config.floatX),
        borrow=True)

    f = theano.function(inputs=[x], outputs=get_outputs(x, w))

    # Single-argument print(...) prints the same on Python 2 and is valid
    # syntax on Python 3.
    print(f(x_value))


if __name__ == "__main__":
    main()

Answer 1

我解决了 danielrenshaw 编辑后给出的那个简化问题：把单独的 h0 作为 outputs_info 传入后，代码就可以正常运行了。此前我用序列的第一个元素作为 outputs_info，我认为是这一点导致了错误。但我仍然无法解决我原来的问题。

import numpy
import theano
import theano.tensor as tt
import theano.ifelse


def inner_scan_step(x_t_t, h_tm1, w):
    """Compute one step of the inner recurrence.

    :param x_t_t: current item of the inner sequence.
    :param h_tm1: state produced by the previous step.
    :param w: weight matrix applied to the previous state.
    :return: ``dot(h_tm1, w) + x_t_t``.
    """
    # This is the form that failed in the reduced example from the question;
    # the accompanying answer reports that it works once the initial state
    # is a separate h0.  The form that never failed was:
    #     h_tm1 + tt.dot(x_t_t, w)
    carried = tt.dot(h_tm1, w)
    return carried + x_t_t


def outer_scan_step(x_t, w, h0):
    """Run the inner recurrence over all of ``x_t`` starting from ``h0``.

    :param x_t: one item of the outer sequence; scanned in full.
    :param w: weight matrix forwarded to ``inner_scan_step``.
    :param h0: explicit initial state for the inner scan.
    :return: final state of the inner scan.
    """
    states, _updates = theano.scan(
        fn=inner_scan_step,
        sequences=[x_t],
        outputs_info=[h0],
        non_sequences=[w],
        strict=True
    )
    final_state = states[-1]
    return final_state


def get_outputs(x, w, h0):
    """Return the gradient of the summed outer-scan output w.r.t. ``w``.

    :param x: symbolic input iterated over by the outer scan.
    :param w: weight variable that is differentiated.
    :param h0: initial state handed to every inner scan.
    :return: symbolic gradient expression.
    """
    # Order matters: outer_scan_step receives the non-sequences as (w, h0).
    shared_args = [w, h0]
    per_item, _updates = theano.scan(
        fn=outer_scan_step,
        sequences=[x],
        non_sequences=shared_args,
        strict=True
    )
    return tt.grad(per_item.sum(), w)


def main():
    """Compile the nested-scan gradient with an explicit h0 and print it."""
    # Evaluate test values while the graph is built and raise if one is
    # missing.
    theano.config.compute_test_value = 'raise'

    x_value = numpy.arange(12, dtype=theano.config.floatX).reshape((2, 2, 3))

    x = tt.tensor3()
    x.tag.test_value = x_value

    w = theano.shared(
        value=numpy.ones((3, 3), dtype=theano.config.floatX),
        borrow=True)
    # Separate all-zero initial state, used instead of the first sequence
    # row.
    h0 = theano.shared(
        value=numpy.zeros(3, dtype=theano.config.floatX),
        borrow=True)

    f = theano.function(inputs=[x], outputs=get_outputs(x, w, h0))

    # Single-argument print(...) prints the same on Python 2 and is valid
    # syntax on Python 3.
    print(f(x_value))


if __name__ == "__main__":
    main()

Answer 2

我遇到了同样的问题，通过在 THEANO_FLAGS 中设置 optimizer=fast_compile 解决了它。我猜这是 Theano 的一个 bug。

theano

2 个答案: