Encoder-decoder model is unable to learn

Asked: 2019-04-20 19:36:07

Tags: python pytorch

In a chatbot built with an encoder-decoder model, the average training loss (cross-entropy) will not decrease, even when training on just a single data point. All of the model's data comes from the Cornell Movie-Dialogs Corpus. To encode words, each word is assigned a number, which is used as the index of a sparse one-hot tensor that is passed to the model.
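For example, encoding one word works like this (a minimal sketch; the vocabulary size 4453 matches the tensor size used in the code below, and the index 42 is made up for illustration):

import torch

VOCAB_SIZE = 4453  # vocabulary size used throughout the code

# Suppose vocab.getIndex("hello") returned 42: the word becomes a one-hot
# sparse vector with a single 1 at that index.
i = 42
word_tensor = torch.sparse_coo_tensor(torch.tensor([[i]]),
                                      torch.tensor([1]),
                                      [VOCAB_SIZE])
print(word_tensor.to_dense().nonzero())  # tensor([[42]])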

The training loop:

for epochcounter in range(10000):
    print("=======================EPOCH " + str(epochcounter) + "==================================")
    for k, batch in enumerate(iterate_batches(data)):
        batchid += 1
        averageloss = 0
        losscounter = 0
        for sample, label in batch:
            sample_t = []
            label_t = []
            embedded_label = []
            label = ['begline'] + label
            # Encode the input sentence: keep only letters, look up the
            # vocabulary index, and build a one-hot sparse tensor per word.
            for rawword in sample:
                rawword = rawword.lower()
                word = "".join(filter(lambda x: x in "qwertyuiopasdfghjklzxcvbnm", rawword))
                i = vocab.getIndex(word)
                sample_t.append(torch.sparse_coo_tensor(torch.tensor([[i]]), torch.tensor([1]), [4453]))
            # Encode the target sentence the same way, also collecting the raw
            # indices as cross-entropy targets ("endline" is appended after
            # every word here).
            for rawword in label:
                rawword = rawword.lower()
                word = "".join(filter(lambda x: x in "qwertyuiopasdfghjklzxcvbnm", rawword))
                i = vocab.getIndex(word)
                embedded_label.append(i)
                embedded_label.append(vocab.getIndex("endline"))
                label_t.append(torch.sparse_coo_tensor(torch.tensor([[i]]), torch.tensor([1]), [4453]))

            out_t = net(sample_t, label_t)
            # Accumulate gradients over every word of every sample in the
            # batch; the optimizer steps once per batch.
            for n, out in enumerate(out_t):
                loss_t = loss_function(out.view(1, 4453), torch.tensor([embedded_label[n]]).view(1))
                loss_t.backward(retain_graph=True)
                loss = loss_t.item()
                averageloss = (losscounter * averageloss + loss) / (losscounter + 1)
                losscounter += 1
        print("Epoch " + str(epochcounter) + ", Batch " + str(k) + ", Average Loss: " + str(averageloss))
        # _ = nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()
        optimizer.zero_grad()
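The snippet above leaves out the surrounding setup. For completeness, a minimal sketch of what it might look like; the cross-entropy loss is stated above, but the optimizer, learning rate, and hidden size are assumptions for illustration, not necessarily what was actually used:

import torch
import torch.nn as nn
import torch.optim as optim

# Assumed setup for the training loop. emb_size/output_size = 4453 match
# the one-hot tensors above; hid_size and the SGD hyperparameters are
# made-up placeholders.
net = Seq2SeqXentropy(emb_size=4453, hid_size=256, output_size=4453)
loss_function = nn.CrossEntropyLoss()  # cross-entropy, as described above
optimizer = optim.SGD(net.parameters(), lr=0.01)
batchid = 0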

Here are the custom model classes:

seq2seqxentropy.py (the class of the net object)

class Seq2SeqXentropy(nn.Module):

    def __init__(self, emb_size, hid_size, output_size):
        super(Seq2SeqXentropy, self).__init__()
        self.encoder = Encoder(emb_size, hid_size)
        self.decoder = Decoder(emb_size, hid_size, output_size)

    def forward(self, input_sentence, target_sentence):
        # Encode the input into a single hidden state, then decode it with
        # the target sentence as teacher-forced decoder input.
        hidden_state = self.encoder(input_sentence)
        output_sentence = self.decoder(hidden_state, target_sentence)
        return output_sentence

encoder.py

class Encoder(nn.Module):

    def __init__(self, emb_size, hid_size):
        super(Encoder, self).__init__()
        self.hid_size = hid_size
        self.rnn = EncoderRNNCell(emb_size, hid_size)

    def forward(self, sentence):
        # The initial hidden state is re-randomized on every forward pass.
        hidden_state = torch.rand(self.hid_size)
        for word in sentence:
            hidden_state = self.rnn(word, hidden_state)
        return hidden_state

decoder.py

class Decoder(nn.Module):

    def __init__(self, emb_size, hid_size, output_size):
        super(Decoder, self).__init__()
        self.rnn = DecoderRNNCell(emb_size, hid_size, output_size)

    def forward(self, hidden_state, targetsentence):
        # Teacher forcing: feed each target word in and collect one output
        # distribution per step.
        output = []
        for x in targetsentence:
            hidden_state, y = self.rnn(x, hidden_state)
            output.append(y)
        return output

rnncell.py

class EncoderRNNCell(nn.Module):

    def __init__(self, emb_size, hid_size):
        super(EncoderRNNCell, self).__init__()
        self.w = nn.Parameter(torch.rand(hid_size, emb_size))
        self.u = nn.Parameter(torch.rand(hid_size, hid_size))

    def forward(self, x, h):
        # Elman recurrence h' = relu(W x + U h), with the sparse one-hot
        # input densified and cast to float first.
        h_f = nn.functional.relu(torch.mv(self.w, x.to_dense().float()) + torch.mv(self.u, h))
        return h_f

class DecoderRNNCell(nn.Module):

    def __init__(self, emb_size, hid_size, output_size):
        super(DecoderRNNCell, self).__init__()
        self.w = nn.Parameter(torch.rand(hid_size, emb_size))
        self.u = nn.Parameter(torch.rand(hid_size, hid_size))
        self.v = nn.Parameter(torch.rand(output_size, hid_size))

    def forward(self, x, h):
        h_f = nn.functional.relu(torch.mv(self.w, x.to_dense().float()) + torch.mv(self.u, h))
        # A softmax over the vocabulary turns the (relu'd) scores into a
        # probability distribution.
        y = nn.functional.softmax(nn.functional.relu(torch.mv(self.v, h_f)), dim=0)
        return h_f, y
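For context, both cells implement the classic Elman recurrence h' = relu(W x + U h). A minimal sketch of the same recurrence using PyTorch's built-in nn.RNNCell (the sizes and index here are illustrative, not from my code):

import torch
import torch.nn as nn

# nn.RNNCell computes h' = nonlinearity(W_ih x + b_ih + W_hh h + b_hh);
# nonlinearity='relu' matches the cells above (the biases are extra).
cell = nn.RNNCell(input_size=4453, hidden_size=256, nonlinearity='relu')
x = torch.zeros(1, 4453)  # a single one-hot word, dense, batch size 1
x[0, 42] = 1.0
h = torch.zeros(1, 256)   # zero-initialized hidden state
h = cell(x, h)            # new hidden state, shape (1, 256)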

0 Answers:

No answers yet.