我设计了一个变量网,但是它出现了一些问题。一般的想法是不同的输入将获得具有相同参数的不同网络,例如具有自动编码器的递归神经网络。
我的代码中有两种情况,如果combine_feat_gt1_1()
则运行一个案例c > 1
,另一种案例运行combine_feat_gt1_0()
。
奇怪的是,如果我评论updates = updates,代码可以无错误地运行,这不是我期望的(代码中的train_test theano函数)。但是,如果我取消注释updates = updates,则会发生错误(代码中的train_test_bug theano函数)。后者是我想实施的。
我已经花了几天时间来处理这个bug。谁能帮我?我会很感激的。
import os
import sys
import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams
from theano.ifelse import ifelse
class Test(object):
def __init__(
self,
numpy_rng,
input=None,
output=None,
n_output=6,
n_input=3,
n_group=2,
W_r=None,
b_r=None
):
self.n_output = n_output
self.n_input = n_input
self.n_group = n_group
if not W_r:
initial_W_r = numpy.asarray(
numpy_rng.uniform(
low=-4 * numpy.sqrt(6. / (n_input + n_input)),
high=4 * numpy.sqrt(6. / (n_input + n_input)),
size=(n_input, n_input)
),
dtype=theano.config.floatX
)
W_r = theano.shared(value=initial_W_r, name='W_r', borrow=True)
if not b_r:
b_r = theano.shared(
value=numpy.zeros(
n_input,
dtype=theano.config.floatX
),
borrow=True
)
self.W_r = W_r
self.b_r = b_r
if input is None:
self.x = T.tensor4(name='input', dtype=theano.config.floatX)
else:
self.x = input
if output is None:
self.y = T.matrix(name='output', dtype=theano.config.floatX)
else:
self.y = output
self.params = [self.W_r, self.b_r]
def get_output_values(self, input):
a, b, c, d = input.shape
def recusive(x_t, h_tm1, wr, hr):
h_t = T.dot(h_tm1, wr) + T.dot(x_t, wr) + hr
return h_t
def combine_recusive(data):
hidden, _ = theano.scan(fn=recusive,
sequences=data[1:],
outputs_info=data[0],
non_sequences=[self.W_r, self.b_r],
n_steps=data[1:].shape[0],
strict=True)
return hidden[-1]
def combine_feat_gt1_1(input):
feats, _ = theano.scan(fn=combine_recusive,
sequences=input[0],
outputs_info=None,
n_steps=input[0].shape[0])
recusive_flag = T.ones(1)
return T.reshape(feats, (1,-1)) # concatenation
def combine_feat_gt1_0(input):
feats = input[0]
recusive_flag = T.zeros(1)
return T.reshape(feats, (1,-1)) # concatenation
feat = ifelse(T.gt(c, 1), combine_feat_gt1_1(input), combine_feat_gt1_0(input))
# debug code snippet
self.debug_ifelse = theano.function([input], T.gt(c, 1))
self.debug_1_0 = theano.function([input], ifelse(T.gt(c, 1), 1, 0))
return feat
def get_cost_updates(self):
learning_rate = 0.1
self.y_given_x = self.get_output_values(self.x)
cost = T.sum(( self.y_given_x - self.y) ** 2)
gparams = T.grad(cost, self.params)
updates = [
(param, param - learning_rate * gparam)
for param, gparam in zip(self.params, gparams)
]
return (cost, updates)
if __name__ == "__main__":
toy_data = numpy.array([[[[1,1,1],[2,2,2]], [[3, 4,5],[4,5,6]]]],dtype=theano.config.floatX)
lable = numpy.array([[1,2,3,4,5,6]],dtype=theano.config.floatX)
toy_data2 = numpy.array([[[[1,1,1]], [[3,4,5]]]],dtype=theano.config.floatX)
lable2 = numpy.array([[6,5,4,3,2,1]],dtype=theano.config.floatX)
x = T.tensor4('x', dtype=theano.config.floatX)
y = T.matrix('y', dtype=theano.config.floatX)
newX = T.tensor4(dtype=x.dtype)
newY = T.matrix(dtype=y.dtype)
rng = numpy.random.RandomState(123)
test = Test(
numpy_rng=rng,
input=x,
output=y,
n_group=2,
n_input=3,
n_output=6
)
cost, updates= test.get_cost_updates()
train_test = theano.function(
[newX, newY],
cost,
# updates=updates,
givens={
x : newX,
y : newY
}
)
train_test_bug = theano.function(
[newX, newY],
cost,
updates=updates,
givens={
x : newX,
y : newY
}
)
print train_test(toy_data, lable)
print train_test(toy_data2, lable2)
# code with bug
# print train_test_bug(toy_data, lable)
# print train_test_bug(toy_data2, lable2)
编辑(@danielrenshaw)
我已将代码缩减为更简单的问题演示。
原因在于双嵌套扫描表达式的梯度计算。当使用修改后的最内部递归表达式时,问题消失(请参阅下面第一个函数中的注释)。
import numpy
import theano
import theano.tensor as tt
import theano.ifelse
def inner_scan_step(x_t_t, h_tm1, w):
# Fails when using this recursive expression
h_t = tt.dot(h_tm1, w) + x_t_t
# No failure when using this recursive expression
# h_t = h_tm1 + tt.dot(x_t_t, w)
return h_t
def outer_scan_step(x_t, w):
h, _ = theano.scan(inner_scan_step,
sequences=[x_t[1:]],
outputs_info=[x_t[0]],
non_sequences=[w],
strict=True)
return h[-1]
def get_outputs(x, w):
features, _ = theano.scan(outer_scan_step,
sequences=[x],
non_sequences=[w],
strict=True)
return tt.grad(features.sum(), w)
def main():
theano.config.compute_test_value = 'raise'
x_value = numpy.arange(12, dtype=theano.config.floatX).reshape((2, 2, 3))
x = tt.tensor3()
x.tag.test_value = x_value
w = theano.shared(value=numpy.ones((3, 3), dtype=theano.config.floatX), borrow=True)
f = theano.function(inputs=[x], outputs=get_outputs(x, w))
print f(x_value)
if __name__ == "__main__":
main()
答案 0 :(得分:1)
我解决了danielrenshaw编辑的这个问题。当我将h0添加为outputs_info时,它可以工作。在此之前我使用序列的第一个元素作为outputs_info,我认为它导致了错误。但我仍然无法解决我原来的问题。
import numpy
import theano
import theano.tensor as tt
import theano.ifelse
def inner_scan_step(x_t_t, h_tm1, w):
# Fails when using this recursive expression
h_t = tt.dot(h_tm1, w) + x_t_t
# No failure when using this recursive expression
# h_t = h_tm1 + tt.dot(x_t_t, w)
return h_t
def outer_scan_step(x_t, w, h0):
h, _ = theano.scan(inner_scan_step,
sequences=[x_t],
outputs_info=[h0],
non_sequences=[w],
strict=True)
return h[-1]
def get_outputs(x, w, h0):
features, _ = theano.scan(outer_scan_step,
sequences=[x],
non_sequences=[w, h0],
strict=True)
return tt.grad(features.sum(), w)
def main():
theano.config.compute_test_value = 'raise'
x_value = numpy.arange(12, dtype=theano.config.floatX).reshape((2, 2, 3))
x = tt.tensor3()
x.tag.test_value = x_value
w = theano.shared(value=numpy.ones((3, 3), dtype=theano.config.floatX), borrow=True)
h0 = theano.shared(value=numpy.zeros(3, dtype=theano.config.floatX), borrow=True)
f = theano.function(inputs=[x], outputs=get_outputs(x, w, h0))
print f(x_value)
if __name__ == "__main__":
main()
答案 1 :(得分:1)
我遇到了同样的问题,我通过在theano_flags中放置optimizer = fast_compile来修复它。猜猜这是theano的错误。