LSTM layer returns nan when fed by its own output in PyTorch

Time: 2018-07-01 18:29:38

Tags: deep-learning lstm pytorch

I am trying to generate time-series data with an LSTM and a Mixture Density Network, as described in https://arxiv.org/pdf/1308.0850.pdf. Here is a link to my implementation: https://github.com/NeoVand/MDNLSTM. The repository contains a toy dataset to train the network on. During training, the LSTM layer returns nan for its hidden state after one iteration. A similar issue is reported here. For your convenience, the code is below:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import numpy.random as npr

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
ts = torch.load('LDS_Toy_Data.pt')

def detach(states):
    return [state.detach() for state in states]

class MDNLSTM(nn.Module):
    def __init__(self, d_obs, d_lat=2, n_gaussians=2, n_layers=1):
        super(MDNLSTM, self).__init__()

        self.d_obs = d_obs
        self.d_lat = d_lat
        self.n_gaussians = n_gaussians
        self.n_layers = n_layers

        self.lstm = nn.LSTM(d_obs, d_lat, n_layers, batch_first=True)
        self.fcPi = nn.Linear(d_lat, n_gaussians*d_obs)
        self.fcMu = nn.Linear(d_lat, n_gaussians*d_obs)
        self.fcSigma = nn.Linear(d_lat, n_gaussians*d_obs)

    def get_mixture_coef(self, y):
        time_steps = y.size(1)
        pi, mu, sigma = self.fcPi(y), self.fcMu(y), self.fcSigma(y)

        pi = pi.view(-1, time_steps, self.n_gaussians, self.d_obs)
        mu = mu.view(-1, time_steps, self.n_gaussians, self.d_obs)
        sigma = sigma.view(-1, time_steps, self.n_gaussians, self.d_obs)

        pi = F.softmax(pi, 2)
        sigma = torch.exp(sigma)
        return pi, mu, sigma


    def forward(self, x, h):
        y, (h, c) = self.lstm(x, h)
        #print(h)
        pi, mu, sigma = self.get_mixture_coef(y)
        return (pi, mu, sigma), (h, c)

    def init_hidden(self, bsz):
        return (torch.zeros(self.n_layers, bsz, self.d_lat).to(device),
                torch.zeros(self.n_layers, bsz, self.d_lat).to(device))

def mdn_loss_fn(y, pi, mu, sigma):
    m = torch.distributions.Normal(loc=mu, scale=sigma)
    loss = torch.exp(m.log_prob(y))
    loss = torch.sum(loss * pi, dim=2)
    loss = -torch.log(loss)
    return loss.mean()

def criterion(y, pi, mu, sigma):
    y = y.unsqueeze(2)
    return mdn_loss_fn(y, pi, mu, sigma)

DOBS = 10
DLAT = 2
INSTS = 100
seqlen = 30
epochs = 200


mdnlstm = MDNLSTM(DOBS, DLAT).to(device)
optimizer = torch.optim.Adam(mdnlstm.parameters())

z = torch.from_numpy(ts[:INSTS,:,:]).float().to(device)

# hiddens=[]
# Train the model
for epoch in range(epochs):
    # Set initial hidden and cell states
    hidden = mdnlstm.init_hidden(INSTS)

    for i in range(0, z.size(1) - seqlen, seqlen):
        # Get mini-batch inputs and targets
        inputs = z[:, i:i+seqlen, :]
        targets = z[:, (i+1):(i+1)+seqlen, :]

        hidden = detach(hidden)
#         hiddens.append(hidden)
        (pi, mu, sigma), hidden = mdnlstm(inputs, hidden)
        loss = criterion(targets, pi, mu, sigma)
        mdnlstm.zero_grad()
        loss.backward()
        optimizer.step()

    if epoch % 100 == 0:
        print ('Epoch [{}/{}], Loss: {:.4f}'
               .format(epoch, epochs, loss.item()))

I would appreciate any help on this.

1 Answer:

Answer 0 (score: 2):

The problem was caused by the log-sum-exp operation not being done in a numerically stable way. Here is the implementation of the weighted log-sum-exp trick I used, which fixed the problem:

def weighted_logsumexp(x, w, dim=None, keepdim=False):
    if dim is None:
        x, dim = x.view(-1), 0
    xm, _ = torch.max(x, dim, keepdim=True)
    x = torch.where(
        # to prevent nasty nan's
        (xm == float('inf')) | (xm == float('-inf')),
        xm,
        xm + torch.log(torch.sum(torch.exp(x - xm) * w, dim, keepdim=True)))
    return x if keepdim else x.squeeze(dim)
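
A quick sanity check (the numbers here are made up for illustration, they are not from the original post): with log-probabilities as negative as the ones that appear once sigma starts to collapse, the naive exp-sum-log path underflows to -inf, while the weighted log-sum-exp stays finite:

logp = torch.tensor([[-800.0, -900.0]])   # two mixture components, made-up values
w = torch.tensor([[0.5, 0.5]])             # mixture weights

naive = torch.log(torch.sum(torch.exp(logp) * w, dim=1))   # exp(-800) underflows to 0, so log(0) = -inf
stable = weighted_logsumexp(logp, w, dim=1)                 # about -800 + log(0.5), i.e. -800.69
print(naive, stable)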

And I implemented the stable loss function using it:

def mdn_loss_stable(y, pi, mu, sigma):
    m = torch.distributions.Normal(loc=mu, scale=sigma)
    m_lp_y = m.log_prob(y)
    loss = -weighted_logsumexp(m_lp_y, pi, dim=2)
    return loss.mean()
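
A minimal sketch of how this slots into the training loop from the question (criterion_stable is my name for the wrapper, it is not part of the original post); only the loss call changes:

def criterion_stable(y, pi, mu, sigma):
    y = y.unsqueeze(2)   # broadcast targets across the mixture dimension, as in the original criterion
    return mdn_loss_stable(y, pi, mu, sigma)

# in the training loop:
#     loss = criterion_stable(targets, pi, mu, sigma)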

This worked like a charm. The problem, in general, is that torch does not report underflows.
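
To see that the underflow really is silent, here is an illustrative snippet (mine, not from the original answer): the exponential of a very negative number quietly becomes 0.0, the log of that is inf, and the backward pass turns it into nan, which matches how nan then spreads into the weights and the hidden state described in the question:

import torch

x = torch.tensor([-800.0], requires_grad=True)
p = torch.exp(x)        # underflows to 0.0 with no warning
loss = -torch.log(p)    # -log(0) = inf, still no warning
loss.backward()         # d(-log p)/dx = (-1/p) * exp(x) = -inf * 0 = nan
print(p.item(), loss.item(), x.grad)   # 0.0 inf tensor([nan])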