TypeError:AdvancedBooleanSubtensor.grad非法返回一个整数值的变量。 (输入索引0,dtype int64)

时间:2018-07-14 13:21:29

标签: python theano

我确实在Theano中遇到了这个问题,主要的问题是Theano中的Scan()运算符,导致AdvancedBooleanSubtensor.grad非法返回了一个整数值的变量。如果有帮助,我将不胜感激。

这是我提取的代码。(如果disconnected_inputs设置为默认的“ raise”,将引发theano.gradient.DisconnectedInputError)。我的Theano版本是:1.0.1。

import numpy as np
from theano import tensor
import theano
from collections import OrderedDict

theano.config.floatX = "float32"
profile=False

def ortho_weight(ndim):
    W = np.random.randn(ndim, ndim)
    u, s, v = np.linalg.svd(W)  # Singular value decomposition for matrix W of ndim*ndim,W=US(V*H)
    return u.astype(theano.config.floatX)

def norm_weight(nin, nout=None, scale=0.01, ortho=True):
    if nout is None:
        nout = nin
    if nout == nin and ortho:
        W = ortho_weight(nin)  # Return a nin*nin size Unitary Matrix after singular value decomposition
    else:
        W = scale * np.random.randn(nin, nout)  # randomly generate a numpy matrix of nin*nout, scale all items inside
    return W.astype(theano.config.floatX)


def init_tparams(params):
    tparams = OrderedDict()
    for kk, pp in params.items():
    tparams[kk] = theano.shared(params[kk],name=kk)  # shared variables

    return tparams

def itemlist(tparams):
    return [vv for kk, vv in tparams.items()]

# params
params = OrderedDict()  # Create an ordered dictionary

context_mask = np.array([[1. ,1. ,1. ,1., 1.],
                         [1., 1., 1., 1., 1.],
                         [1., 1., 1., 1., 1.],
                         [1., 1., 1., 1., 1.],
                         [1., 1., 1., 1., 0.],
                         [1., 1., 0., 1., 0.],
                         [1., 0., 0., 0., 0.]]).astype(theano.config.floatX)  # nsteps_src = 7 , n_samples = 5
mask = np.array([[1. ,1. ,1. ,1., 1.],
                 [1., 1., 1., 1., 1.],
                 [1., 1., 1., 1., 1.],
                 [1., 1., 1., 1., 1.],
                 [1., 1., 1., 1., 0.],
                 [1., 1., 0., 1., 0.],
                 [1., 0., 0., 0., 0.]]).astype(theano.config.floatX) # nsteps_trg = 7 , n_samples = 5

D = 1
numPositions = 2*D+1  # 3
nsteps_src = context_mask.shape[0]
nsteps_trg = mask.shape[0]
n_samples = context_mask.shape[1]
n_words = nsteps_trg*n_samples
dim_word = 6

params['Wemb_dec'] = norm_weight(n_words, dim_word)  # 35*6
emb = theano.shared(params['Wemb_dec'])  # shard variables
emb = emb.reshape([nsteps_trg, n_samples, dim_word])
emb_shifted = tensor.zeros_like(emb)
emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])  # Set the first timestps to 0, others no change
emb = emb_shifted

cc_=0.01*np.random.randn(nsteps_src,n_samples,4).astype(theano.config.floatX)
dim = cc_.shape[2]  # dim = 4

context_mask = theano.shared(context_mask) # shard variables
context_mask1 = context_mask
mask = theano.shared(mask)
cc_ = theano.shared(cc_)

# These weight matrix and bias term are to be used for feedforward neural network,Respectively
params['ff_state_W'] = norm_weight(dim, dim, scale=0.01, ortho=True)  #
params['ff_state_b'] = np.zeros((dim,)).astype(theano.config.floatX)

params['ff_logit_lstm_W'] = norm_weight(dim, dim_word, scale=0.01, ortho=False)
params['ff_logit_lstm_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)

params['ff_logit_prev_W'] = norm_weight(dim_word, dim_word, scale=0.01, ortho=False)
params['ff_logit_prev_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)

params['ff_logit_ctx_W'] = norm_weight(dim, dim_word, scale=0.01, ortho=False)
params['ff_logit_ctx_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)

params['ff_logit_W'] =  norm_weight(dim_word, n_words, scale=0.01, ortho=True)
params['ff_logit_b'] = np.zeros((n_words,)).astype(theano.config.floatX)

# These parameters will be used for Gated Recurrent Unit(GRU) in scan().
params['W'] = np.concatenate([norm_weight(dim_word, dim),norm_weight(dim_word, dim)], axis=1)  # dim_word*2dim
params['b'] = np.zeros((2 * dim,)).astype(theano.config.floatX)  # 2dim

params['U'] = np.concatenate([ortho_weight(dim),ortho_weight(dim)], axis=1)  # 2 svd matrices (size :dim*dim) connected by columns

params['Wx'] = norm_weight(dim_word, dim)  # dim_word*dim

params['Ux'] = ortho_weight(dim)  # svd matrices (size :dim*dim)
params['bx'] = np.zeros((dim,)).astype(theano.config.floatX)  # dim

params['U_nl'] = np.concatenate([ortho_weight(dim),ortho_weight(dim)], axis=1) # 2 svd matrices (size :dim*dim) connected by columns
params['b_nl'] = np.zeros((2 * dim,)).astype(theano.config.floatX)  # 2dim

params['Ux_nl'] = ortho_weight(dim)  # svd matrices (size :dim*dim)
params['bx_nl'] = np.zeros((dim,)).astype(theano.config.floatX)  # dim

params['Wc'] = norm_weight(dim, dim*2)        # dim * 2dim
params['Wcx'] = norm_weight(dim, dim)         # svd matrices (size :dim*dim)

params['W_comb_att'] = norm_weight(dim, dim)  # svd matrices (size :dim*dim)
params['Wc_att'] = norm_weight(dim)           # svd matrices (size :dim*dim)

params['b_att'] = np.zeros((dim,)).astype(theano.config.floatX)  # dim

params['U_att'] = norm_weight(dim, 1)  # dim*1
params['c_tt'] = np.zeros((1,)).astype(theano.config.floatX)

tparams = init_tparams(params)

def _slice(_x, n, dim):  # slice() for tensor,get [:,n*dim:(n+1)*dim] of tensor
    if _x.ndim == 3:
        return _x[:, :, n * dim:(n + 1) * dim]
    return _x[:, n * dim:(n + 1) * dim]

ctx_mean = (cc_ * context_mask[:, :, None]).sum(0) / context_mask.sum(0)[:, None]
init_state = tensor.tanh(tensor.dot(ctx_mean,tparams['ff_state_W'])+tparams['ff_state_b'])  # Initialization of h_ in _step_slice of scan()

state_belowx = tensor.dot(emb, tparams['Wx']) +tparams['bx']   
# state_belowx = emb*tparams['Wx']+tparams['bx']

state_below_ = tensor.dot(emb, tparams['W']) + tparams['b']   
# state_below_ = emb*tparams['W']+tparams['b']

def _step_slice(m, m_, x_, xx_, h_, ctx_, alpha_,cc_,context_mask1
                Wc_att, b_att,U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx,
                U_nl, Ux_nl, b_nl, bx_nl): 
    preact1 = tensor.dot(h_, U)
    preact1 += x_
    preact1 = tensor.nnet.sigmoid(preact1)

    r1 = _slice(preact1, 0, dim)
    u1 = _slice(preact1, 1, dim)

    preactx1 = tensor.dot(h_, Ux)
    preactx1 *= r1
    preactx1 += xx_

    h1 = tensor.tanh(preactx1)

    h1 = u1 * h_ + (1. - u1) * h1
    h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_  # h1 is source hidden state ,batchsize*dim

    srclen = (context_mask1.sum(0,keepdims=True)- 1).T  # context_mask.sum(0,keepdims=True)-1 Corresponding to source sentence length of this batch
    pctx__ = tensor.tanh(tensor.dot(h1, W_comb_att))
    pctx__ = tensor.dot(pctx__, U_att) + c_tt          # batchsize*1
    pstate_ = srclen * tensor.nnet.sigmoid(pctx__) + 1  # +1:eos is considered
    srcPositions = tensor.floor(pstate_)                # batchsize*1
    srcPositions = tensor.cast(srcPositions, dtype='int32')  # srcPosition is index, so shoud cast to int,like 3.6-->3

    unmaskedId = tensor.flatnonzero(m)          # per timesteps ,take per row element of the source mask matrix  as m ,(batchsize-x)
    srcPositions = srcPositions[unmaskedId,:]   # (batchsize-x)*1
    srclen = srclen[unmaskedId,:]              # (batchsize-x)*1
    startAttnIds = srcPositions - D
    indicesAll = startAttnIds.repeat(numPositions, axis=1)
    indicesAll += tensor.mgrid[0:unmaskedId.shape[0], 0:numPositions][1]  # (batchsize-x)*numPositions
    indicesAll = indicesAll.T.flatten()  # 1*(numPositions*(batchsize-x))

    # Delete the source sentence index position centered on pstate_ and the window size D, which exceeds 0 or the maximum length of the sentence
    includeIds = (indicesAll <= tensor.tile(srclen, [numPositions, 1]).flatten()) & (indicesAll >= 0)
    indicesAll = indicesAll[includeIds]  # dimensional reduction,1*((numPosition*batchsize-x)-y)

    indicesSub = tensor.arange(0,numPositions).repeat(unmaskedId.shape[0])  # Scale-out numPositions times,1*(numPosition*batchsize-x)
    indicesSub = indicesSub[includeIds]  # 1*((numPosition*batchsize-x)-y)
    unmaskedIds = tensor.tile(unmaskedId,numPositions)  # Scale-out numPositions times,,1*(numPosition*batchsize-x)
    unmaskedIds = unmaskedIds[includeIds]  # 1*((numPosition*batchsize-x)-y)

    srcVecsSub = tensor.zeros([numPositions*n_samples,dim])   # 15*3
    linearIdSub = indicesSub*n_samples+ unmaskedIds           # 1* ((numPosition*batchsize-x)-y)
    linearIdAll = indicesAll*n_samples+ unmaskedIds          # 1* ((numPosition*batchsize-x)-y)
    cc_ = tensor.reshape(cc_,[nsteps_src * n_samples,dim])    # 35*3
    srcVecsSub = tensor.set_subtensor(srcVecsSub[linearIdSub, :],\ 
    cc_[linearIdAll, :])  # numPositions*n_samples*dim
    srcVecsSub = srcVecsSub.reshape([numPositions, n_samples, dim])

    e_ij = tensor.dot(srcVecsSub,Wc_att)+b_att  # equivalent to eq:W_a*hs+b_a,numPositions * batchSize*dim
    e_ij = (h1*e_ij).sum(2)                     # equivalent to eq:h_t' *(W_a*hs+b_a),numPositions * batchSize

    scaleX = (indicesAll - tensor.tile(pstate_[unmaskedId], (numPositions,1)).flatten())/(D/2)  
    # unmaskedIds.shape[0]/n_samples即((numPosition*batchsize-x)-y) /batchsize
    distWeights = tensor.zeros([numPositions,n_samples])
    distWeights = tensor.set_subtensor(distWeights[indicesSub,unmaskedIds],\
    scaleX)  # batchSize numPositions

    alpha = e_ij * tensor.exp(-0.5*tensor.square(distWeights))
    alpha = alpha - alpha.max(0)  # subtract max elements
    alpha = tensor.exp(alpha)  # numPositions * batchSize

    context_mask_ = tensor.zeros([numPositions, n_samples])
    context_mask_ = tensor.set_subtensor(context_mask_[indicesSub, unmaskedIds], 1.)
    if context_mask_:  # context_mask =x_mask,nsteps_src * batchSize,be truncated to numPositions * batchSize
        alpha = alpha * context_mask_
    alpha_sum = alpha.sum(0,keepdims=True)             # ∑eij
    alpha_sum = tensor.switch(alpha_sum,alpha_sum,1.)  # if alpua_sum = 0 ->1
    alpha = alpha / alpha_sum                          # (numPositions * batchSize),eij/∑eij

    # current context,(numPositions*batchSize*dim).sum(0) -->batchSize*dim
    ctx_ = (srcVecsSub * alpha[:, :, None]).sum(0)

    preact2 = tensor.dot(h1, U_nl) + b_nl
    preact2 += tensor.dot(ctx_, Wc)
    preact2 = tensor.nnet.sigmoid(preact2)

    r2 = _slice(preact2, 0, dim)
    u2 = _slice(preact2, 1, dim)

    preactx2 = tensor.dot(h1, Ux_nl) + bx_nl
    preactx2 *= r2
    preactx2 += tensor.dot(ctx_, Wcx)

    h2 = tensor.tanh(preactx2)

    h2 = u2 * h1 + (1. - u2) * h2
    h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1

    return h2, ctx_, alpha.T

seqs = [context_mask, mask, state_below_, state_belowx]

shared_vars = [tparams['Wc_att'],
               tparams['b_att'],
               tparams['U'],
               tparams['Wc'],
               tparams['W_comb_att'],
               tparams['U_att'],
               tparams['c_tt'],
               tparams['Ux'],
               tparams['Wcx'],
               tparams['U_nl'],
               tparams['Ux_nl'],
               tparams['b_nl'],
               tparams['bx_nl']]

rval, updates = theano.scan(_step_slice,
                            sequences=seqs,
                            outputs_info=[init_state,
                                          tensor.alloc(0., n_samples,
                                                       cc_.shape[2]),
                                          tensor.alloc(0., n_samples,
                                                       cc_.shape[0])],
                            non_sequences=[cc_,context_mask1]+shared_vars,
                            name='layers',
                            n_steps=nsteps_trg,
                            profile=profile,
                            strict=True)

proj_h = rval[0]
ctxs = rval[1]

# 3 feedforward neural networks,used to indicate the contribution of proj_h, emb, ctxs to cost
logit_lstm = tensor.dot(proj_h,tparams['ff_logit_lstm_W'])+\
tparams['ff_logit_lstm_b']

logit_prev = tensor.dot(emb,tparams['ff_logit_prev_W'])+\
tparams['ff_logit_prev_b']

logit_ctx =tensor.dot(ctxs,tparams['ff_logit_ctx_W'])+\
tparams['ff_logit_ctx_b']

logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx)

# a feedforward neural network layer + softmax
logit = tensor.dot(logit,tparams['ff_logit_W'])+tparams['ff_logit_b']
logit_shp = logit.shape
probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1],
                                               logit_shp[2]]))
# cost
cost = -tensor.log(probs.flatten())
cost = cost.reshape([mask.shape[0], mask.shape[1]])
cost = (cost * mask).sum(0)

# make mean(), tensor->scalar
cost = cost.mean()

print('Computing gradient...', end=" ")
grads = tensor.grad(cost, wrt=itemlist(tparams),disconnected_inputs = 'ignore')
print('Done')

0 个答案:

没有答案