Unable to understand how Theano does classification in an RNN for NLP

Asked: 2015-07-29 12:17:27

Tags: python machine-learning nlp theano deep-learning

import os
import theano, numpy
from theano import tensor as T
from collections import OrderedDict

class RNNSLU(object):
    """ Elman recurrent neural net """

    def __init__(self, nh, nc, ne, de, cs):
        """
        Hyperparameters used for initialization
        nh : dimension of the hidden layer
        nc : number of classes (labels)
        ne : size of vocabulary
        de : dimension of embedding
        cs : word context window size
        """
        # Parameter to be learnt : word embeddings
        self.embeddings = theano.shared(name='embeddings',
            value=0.2 * numpy.random.uniform(-1.0, 1.0, (ne + 1, de))
            .astype(theano.config.floatX))

        # Parameter to be learnt : weight matrix mapping the input to the hidden layer (de*cs x nh)
        self.wx = theano.shared(name='wx',
            value=0.2 * numpy.random.uniform(-1.0, 1.0, (de * cs, nh))
            .astype(theano.config.floatX))

        # Parameter to be learnt : weight matrix mapping the hidden layer at the
        # previous time step to the current one (nh x nh)
        self.wh = theano.shared(name='wh',
            value=0.2 * numpy.random.uniform(-1.0, 1.0, (nh, nh))
            .astype(theano.config.floatX))

        # Parameter to be learnt : weight matrix mapping the hidden to the output layer (nh x nc)
        self.w = theano.shared(name='w',
            value=0.2 * numpy.random.uniform(-1.0, 1.0, (nh, nc))
            .astype(theano.config.floatX))

        # Parameter to be learnt : bias of the hidden layer
        self.bh = theano.shared(name='bh',
            value=numpy.zeros(nh, dtype=theano.config.floatX))

        # Parameter to be learnt : bias of the output layer
        self.b = theano.shared(name='b',
            value=numpy.zeros(nc, dtype=theano.config.floatX))

        # Parameter to be learnt : hidden layer state at time t=0
        self.h0 = theano.shared(name='h0',
            value=numpy.zeros(nh, dtype=theano.config.floatX))

        # Bundle the parameters
        self.params = [self.embeddings, self.wx, self.wh, self.w,
                       self.bh, self.b, self.h0]
        self.names = ['embeddings', 'Wx', 'Wh', 'W', 'bh', 'b', 'h0']

        # Compile the training and classification functions
        self.prepare_train(de, cs)

    def prepare_train(self, de, cs):
        """
        Builds and compiles the training and classification functions.
        """
        # columns = number of words in the context window, rows = length of the sentence
        idxs = T.imatrix()
        # Prepare to receive the input and the output label
        x = self.embeddings[idxs].reshape((idxs.shape[0], de * cs))
        y = T.iscalar('y')

        def recurrence(x_t, h_tm1):
            """
            x_t : input at time t
            h_tm1 : hidden state at time t-1
            """
            # Compute the hidden state at time t:
            # h_t = g(x_t . w_x + h_tm1 . w_h + b_h)
            h_t = T.nnet.sigmoid(T.dot(x_t, self.wx) + T.dot(h_tm1, self.wh) + self.bh)
            # Compute the output layer:
            # s_t = softmax(h_t . w + b)
            s_t = T.nnet.softmax(T.dot(h_t, self.w) + self.b)
            return [h_t, s_t]

        [h, s], _ = theano.scan(fn=recurrence,
                                sequences=x,
                                outputs_info=[self.h0, None],
                                n_steps=x.shape[0])

        # print h.ndim
        # print s.ndim

        # TODO: What is the structure of s? What does the selection of axis do?
        p_y_given_sentence = s[:, 0, :]
        y_pred = T.argmax(p_y_given_sentence, axis=1)

        # Learning rate
        lr = T.scalar('lr')
        # Sentence negative log-likelihood (the objective function)
        sentence_nll = -T.mean(T.log(p_y_given_sentence)[T.arange(x.shape[0]), y])
        # Compute parameter-wise gradients
        sentence_gradients = T.grad(sentence_nll, self.params)
        # Compute updates
        sentence_updates = OrderedDict((p, p - lr * g)
                                       for p, g in zip(self.params, sentence_gradients))

        # Compile functions
        self.classify = theano.function(inputs=[idxs], outputs=y_pred)
        self.sentence_train = theano.function(inputs=[idxs, y, lr],
                                              outputs=sentence_nll,
                                              updates=sentence_updates)



#### Main code, where the class is instantiated and called
rnn = RNNSLU(nh=s['nhidden'], nc=nClasses, ne=vocSize, de=s['emb_dimension'], cs=s['win'])

for word_batch, label_last_word in zip(words, labels):
    rnn.sentence_train(word_batch, label_last_word, s['clr'])
    rnn.normalize()  # presumably defined in the full class; not shown above

Code explanation:

I know this is not considered good practice on Stack Overflow, but I have been trying to decode this code for training a recurrent neural network for over a week now. To begin with, I am new to Theano.

word_batch = array([[-1, -1, -1, 194, 358, 463, 208]], dtype=int32)
label_last_word = 126

Here word_batch holds indices from a sentence such as:

'I am going from the UK to the USA'

word_batch is the context window associated with one particular word, e.g. USA. So if the context window size is 7, the middle entry (194) is the index of that word in the dataset. I want to understand how training happens inside the RNNSLU class when I pass this as an argument to rnn.sentence_train. I am confused about how variables such as idxs and x are used in that class. I understand how this works in theory, but cannot clearly decode the Theano part. If my question does not make sense, please let me know.
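For reference, here is a minimal sketch of how such a context-window matrix can be built from a sentence's word indices, in the spirit of the contextwin helper from the Theano RNNSLU tutorial (the function name and the -1 padding value are assumptions based on the example above):

def contextwin(sentence, win):
    """Return, for each word index in `sentence` (a list of ints), the
    window of `win` indices centred on it, padding with -1 past the ends."""
    assert win % 2 == 1
    pad = (win // 2) * [-1]
    padded = pad + sentence + pad
    return [padded[i:i + win] for i in range(len(sentence))]

# For a sentence starting [194, 358, 463, 208, ...] and win=7, the window
# centred on the first word is [-1, -1, -1, 194, 358, 463, 208],
# which matches the word_batch row above.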

Thanks.

1 Answer:

Answer 0 (score: 1)

rnn.sentence_train is a Theano function compiled with updates=sentence_updates. This means that on every call to rnn.sentence_train, every shared variable appearing as a key of the sentence_updates dictionary is updated according to the symbolic update expression in the corresponding dictionary value. These expressions are all classic gradient descent: the new value is the current parameter value minus the learning rate times the gradient of the cost with respect to that parameter.
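As a minimal, self-contained sketch of that mechanism (a toy example, not the poster's code): each call to a Theano function with an updates dictionary overwrites the shared variables in place.

import numpy
import theano
import theano.tensor as T
from collections import OrderedDict

# One learnable parameter stored in a shared variable
w = theano.shared(numpy.asarray(3.0, dtype=theano.config.floatX), name='w')
lr = T.scalar('lr')

cost = (w - 1.0) ** 2                        # toy cost, minimised at w = 1
grad = T.grad(cost, w)                       # symbolic gradient of cost w.r.t. w
updates = OrderedDict([(w, w - lr * grad)])  # classic gradient-descent step

step = theano.function(inputs=[lr], outputs=cost, updates=updates)

step(0.1)             # updates w in place: w <- w - 0.1 * dcost/dw
print(w.get_value())  # w has moved from 3.0 towards 1.0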

idxs is a symbolic placeholder for the input to the training function. In your example, word_batch fills that placeholder when the training function is called.
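Concretely, when you call rnn.sentence_train(word_batch, label_last_word, s['clr']), idxs takes the value of word_batch, which has shape (1, 7) in your example. The advanced indexing self.embeddings[idxs] then yields a (1, 7, de) tensor, and the reshape flattens it to (1, 7*de): one row per context window, each row the concatenation of the seven word embeddings in that window; scan then iterates over those rows. The numpy equivalent, as a sketch with assumed sizes:

import numpy

de, cs = 5, 7       # assumed embedding dimension and context window size
ne = 500            # assumed vocabulary size
embeddings = 0.2 * numpy.random.uniform(-1.0, 1.0, (ne + 1, de))

word_batch = numpy.array([[-1, -1, -1, 194, 358, 463, 208]], dtype='int32')

# Index -1 selects the extra (ne+1)-th row, which serves as the padding embedding
x = embeddings[word_batch].reshape((word_batch.shape[0], de * cs))
print(x.shape)  # (1, 35): one time step, seven concatenated 5-dim embeddings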