我确实在Theano中遇到了这个问题,主要的问题是Theano中的Scan()运算符,导致AdvancedBooleanSubtensor.grad非法返回了一个整数值的变量。如果有帮助,我将不胜感激。
这是我提取的代码。(如果disconnected_inputs设置为默认的“ raise”,将引发theano.gradient.DisconnectedInputError)。我的Theano版本是:1.0.1。
import numpy as np
from theano import tensor
import theano
from collections import OrderedDict
theano.config.floatX = "float32"
profile=False
def ortho_weight(ndim):
W = np.random.randn(ndim, ndim)
u, s, v = np.linalg.svd(W) # Singular value decomposition for matrix W of ndim*ndim,W=US(V*H)
return u.astype(theano.config.floatX)
def norm_weight(nin, nout=None, scale=0.01, ortho=True):
if nout is None:
nout = nin
if nout == nin and ortho:
W = ortho_weight(nin) # Return a nin*nin size Unitary Matrix after singular value decomposition
else:
W = scale * np.random.randn(nin, nout) # randomly generate a numpy matrix of nin*nout, scale all items inside
return W.astype(theano.config.floatX)
def init_tparams(params):
tparams = OrderedDict()
for kk, pp in params.items():
tparams[kk] = theano.shared(params[kk],name=kk) # shared variables
return tparams
def itemlist(tparams):
return [vv for kk, vv in tparams.items()]
# params
params = OrderedDict() # Create an ordered dictionary
context_mask = np.array([[1. ,1. ,1. ,1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 0.],
[1., 1., 0., 1., 0.],
[1., 0., 0., 0., 0.]]).astype(theano.config.floatX) # nsteps_src = 7 , n_samples = 5
mask = np.array([[1. ,1. ,1. ,1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 0.],
[1., 1., 0., 1., 0.],
[1., 0., 0., 0., 0.]]).astype(theano.config.floatX) # nsteps_trg = 7 , n_samples = 5
D = 1
numPositions = 2*D+1 # 3
nsteps_src = context_mask.shape[0]
nsteps_trg = mask.shape[0]
n_samples = context_mask.shape[1]
n_words = nsteps_trg*n_samples
dim_word = 6
params['Wemb_dec'] = norm_weight(n_words, dim_word) # 35*6
emb = theano.shared(params['Wemb_dec']) # shard variables
emb = emb.reshape([nsteps_trg, n_samples, dim_word])
emb_shifted = tensor.zeros_like(emb)
emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) # Set the first timestps to 0, others no change
emb = emb_shifted
cc_=0.01*np.random.randn(nsteps_src,n_samples,4).astype(theano.config.floatX)
dim = cc_.shape[2] # dim = 4
context_mask = theano.shared(context_mask) # shard variables
context_mask1 = context_mask
mask = theano.shared(mask)
cc_ = theano.shared(cc_)
# These weight matrix and bias term are to be used for feedforward neural network,Respectively
params['ff_state_W'] = norm_weight(dim, dim, scale=0.01, ortho=True) #
params['ff_state_b'] = np.zeros((dim,)).astype(theano.config.floatX)
params['ff_logit_lstm_W'] = norm_weight(dim, dim_word, scale=0.01, ortho=False)
params['ff_logit_lstm_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)
params['ff_logit_prev_W'] = norm_weight(dim_word, dim_word, scale=0.01, ortho=False)
params['ff_logit_prev_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)
params['ff_logit_ctx_W'] = norm_weight(dim, dim_word, scale=0.01, ortho=False)
params['ff_logit_ctx_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)
params['ff_logit_W'] = norm_weight(dim_word, n_words, scale=0.01, ortho=True)
params['ff_logit_b'] = np.zeros((n_words,)).astype(theano.config.floatX)
# These parameters will be used for Gated Recurrent Unit(GRU) in scan().
params['W'] = np.concatenate([norm_weight(dim_word, dim),norm_weight(dim_word, dim)], axis=1) # dim_word*2dim
params['b'] = np.zeros((2 * dim,)).astype(theano.config.floatX) # 2dim
params['U'] = np.concatenate([ortho_weight(dim),ortho_weight(dim)], axis=1) # 2 svd matrices (size :dim*dim) connected by columns
params['Wx'] = norm_weight(dim_word, dim) # dim_word*dim
params['Ux'] = ortho_weight(dim) # svd matrices (size :dim*dim)
params['bx'] = np.zeros((dim,)).astype(theano.config.floatX) # dim
params['U_nl'] = np.concatenate([ortho_weight(dim),ortho_weight(dim)], axis=1) # 2 svd matrices (size :dim*dim) connected by columns
params['b_nl'] = np.zeros((2 * dim,)).astype(theano.config.floatX) # 2dim
params['Ux_nl'] = ortho_weight(dim) # svd matrices (size :dim*dim)
params['bx_nl'] = np.zeros((dim,)).astype(theano.config.floatX) # dim
params['Wc'] = norm_weight(dim, dim*2) # dim * 2dim
params['Wcx'] = norm_weight(dim, dim) # svd matrices (size :dim*dim)
params['W_comb_att'] = norm_weight(dim, dim) # svd matrices (size :dim*dim)
params['Wc_att'] = norm_weight(dim) # svd matrices (size :dim*dim)
params['b_att'] = np.zeros((dim,)).astype(theano.config.floatX) # dim
params['U_att'] = norm_weight(dim, 1) # dim*1
params['c_tt'] = np.zeros((1,)).astype(theano.config.floatX)
tparams = init_tparams(params)
def _slice(_x, n, dim): # slice() for tensor,get [:,n*dim:(n+1)*dim] of tensor
if _x.ndim == 3:
return _x[:, :, n * dim:(n + 1) * dim]
return _x[:, n * dim:(n + 1) * dim]
ctx_mean = (cc_ * context_mask[:, :, None]).sum(0) / context_mask.sum(0)[:, None]
init_state = tensor.tanh(tensor.dot(ctx_mean,tparams['ff_state_W'])+tparams['ff_state_b']) # Initialization of h_ in _step_slice of scan()
state_belowx = tensor.dot(emb, tparams['Wx']) +tparams['bx']
# state_belowx = emb*tparams['Wx']+tparams['bx']
state_below_ = tensor.dot(emb, tparams['W']) + tparams['b']
# state_below_ = emb*tparams['W']+tparams['b']
def _step_slice(m, m_, x_, xx_, h_, ctx_, alpha_,cc_,context_mask1
Wc_att, b_att,U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx,
U_nl, Ux_nl, b_nl, bx_nl):
preact1 = tensor.dot(h_, U)
preact1 += x_
preact1 = tensor.nnet.sigmoid(preact1)
r1 = _slice(preact1, 0, dim)
u1 = _slice(preact1, 1, dim)
preactx1 = tensor.dot(h_, Ux)
preactx1 *= r1
preactx1 += xx_
h1 = tensor.tanh(preactx1)
h1 = u1 * h_ + (1. - u1) * h1
h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_ # h1 is source hidden state ,batchsize*dim
srclen = (context_mask1.sum(0,keepdims=True)- 1).T # context_mask.sum(0,keepdims=True)-1 Corresponding to source sentence length of this batch
pctx__ = tensor.tanh(tensor.dot(h1, W_comb_att))
pctx__ = tensor.dot(pctx__, U_att) + c_tt # batchsize*1
pstate_ = srclen * tensor.nnet.sigmoid(pctx__) + 1 # +1:eos is considered
srcPositions = tensor.floor(pstate_) # batchsize*1
srcPositions = tensor.cast(srcPositions, dtype='int32') # srcPosition is index, so shoud cast to int,like 3.6-->3
unmaskedId = tensor.flatnonzero(m) # per timesteps ,take per row element of the source mask matrix as m ,(batchsize-x)
srcPositions = srcPositions[unmaskedId,:] # (batchsize-x)*1
srclen = srclen[unmaskedId,:] # (batchsize-x)*1
startAttnIds = srcPositions - D
indicesAll = startAttnIds.repeat(numPositions, axis=1)
indicesAll += tensor.mgrid[0:unmaskedId.shape[0], 0:numPositions][1] # (batchsize-x)*numPositions
indicesAll = indicesAll.T.flatten() # 1*(numPositions*(batchsize-x))
# Delete the source sentence index position centered on pstate_ and the window size D, which exceeds 0 or the maximum length of the sentence
includeIds = (indicesAll <= tensor.tile(srclen, [numPositions, 1]).flatten()) & (indicesAll >= 0)
indicesAll = indicesAll[includeIds] # dimensional reduction,1*((numPosition*batchsize-x)-y)
indicesSub = tensor.arange(0,numPositions).repeat(unmaskedId.shape[0]) # Scale-out numPositions times,1*(numPosition*batchsize-x)
indicesSub = indicesSub[includeIds] # 1*((numPosition*batchsize-x)-y)
unmaskedIds = tensor.tile(unmaskedId,numPositions) # Scale-out numPositions times,,1*(numPosition*batchsize-x)
unmaskedIds = unmaskedIds[includeIds] # 1*((numPosition*batchsize-x)-y)
srcVecsSub = tensor.zeros([numPositions*n_samples,dim]) # 15*3
linearIdSub = indicesSub*n_samples+ unmaskedIds # 1* ((numPosition*batchsize-x)-y)
linearIdAll = indicesAll*n_samples+ unmaskedIds # 1* ((numPosition*batchsize-x)-y)
cc_ = tensor.reshape(cc_,[nsteps_src * n_samples,dim]) # 35*3
srcVecsSub = tensor.set_subtensor(srcVecsSub[linearIdSub, :],\
cc_[linearIdAll, :]) # numPositions*n_samples*dim
srcVecsSub = srcVecsSub.reshape([numPositions, n_samples, dim])
e_ij = tensor.dot(srcVecsSub,Wc_att)+b_att # equivalent to eq:W_a*hs+b_a,numPositions * batchSize*dim
e_ij = (h1*e_ij).sum(2) # equivalent to eq:h_t' *(W_a*hs+b_a),numPositions * batchSize
scaleX = (indicesAll - tensor.tile(pstate_[unmaskedId], (numPositions,1)).flatten())/(D/2)
# unmaskedIds.shape[0]/n_samples即((numPosition*batchsize-x)-y) /batchsize
distWeights = tensor.zeros([numPositions,n_samples])
distWeights = tensor.set_subtensor(distWeights[indicesSub,unmaskedIds],\
scaleX) # batchSize numPositions
alpha = e_ij * tensor.exp(-0.5*tensor.square(distWeights))
alpha = alpha - alpha.max(0) # subtract max elements
alpha = tensor.exp(alpha) # numPositions * batchSize
context_mask_ = tensor.zeros([numPositions, n_samples])
context_mask_ = tensor.set_subtensor(context_mask_[indicesSub, unmaskedIds], 1.)
if context_mask_: # context_mask =x_mask,nsteps_src * batchSize,be truncated to numPositions * batchSize
alpha = alpha * context_mask_
alpha_sum = alpha.sum(0,keepdims=True) # ∑eij
alpha_sum = tensor.switch(alpha_sum,alpha_sum,1.) # if alpua_sum = 0 ->1
alpha = alpha / alpha_sum # (numPositions * batchSize),eij/∑eij
# current context,(numPositions*batchSize*dim).sum(0) -->batchSize*dim
ctx_ = (srcVecsSub * alpha[:, :, None]).sum(0)
preact2 = tensor.dot(h1, U_nl) + b_nl
preact2 += tensor.dot(ctx_, Wc)
preact2 = tensor.nnet.sigmoid(preact2)
r2 = _slice(preact2, 0, dim)
u2 = _slice(preact2, 1, dim)
preactx2 = tensor.dot(h1, Ux_nl) + bx_nl
preactx2 *= r2
preactx2 += tensor.dot(ctx_, Wcx)
h2 = tensor.tanh(preactx2)
h2 = u2 * h1 + (1. - u2) * h2
h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1
return h2, ctx_, alpha.T
seqs = [context_mask, mask, state_below_, state_belowx]
shared_vars = [tparams['Wc_att'],
tparams['b_att'],
tparams['U'],
tparams['Wc'],
tparams['W_comb_att'],
tparams['U_att'],
tparams['c_tt'],
tparams['Ux'],
tparams['Wcx'],
tparams['U_nl'],
tparams['Ux_nl'],
tparams['b_nl'],
tparams['bx_nl']]
rval, updates = theano.scan(_step_slice,
sequences=seqs,
outputs_info=[init_state,
tensor.alloc(0., n_samples,
cc_.shape[2]),
tensor.alloc(0., n_samples,
cc_.shape[0])],
non_sequences=[cc_,context_mask1]+shared_vars,
name='layers',
n_steps=nsteps_trg,
profile=profile,
strict=True)
proj_h = rval[0]
ctxs = rval[1]
# 3 feedforward neural networks,used to indicate the contribution of proj_h, emb, ctxs to cost
logit_lstm = tensor.dot(proj_h,tparams['ff_logit_lstm_W'])+\
tparams['ff_logit_lstm_b']
logit_prev = tensor.dot(emb,tparams['ff_logit_prev_W'])+\
tparams['ff_logit_prev_b']
logit_ctx =tensor.dot(ctxs,tparams['ff_logit_ctx_W'])+\
tparams['ff_logit_ctx_b']
logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx)
# a feedforward neural network layer + softmax
logit = tensor.dot(logit,tparams['ff_logit_W'])+tparams['ff_logit_b']
logit_shp = logit.shape
probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1],
logit_shp[2]]))
# cost
cost = -tensor.log(probs.flatten())
cost = cost.reshape([mask.shape[0], mask.shape[1]])
cost = (cost * mask).sum(0)
# make mean(), tensor->scalar
cost = cost.mean()
print('Computing gradient...', end=" ")
grads = tensor.grad(cost, wrt=itemlist(tparams),disconnected_inputs = 'ignore')
print('Done')