I am trying to implement a mixture density network (MDN) that learns a mixture of Gaussians. I want to use the MDN to fit a conditional probability distribution p(y|x). The model input x has shape (batch_size, 1), and y (the label) has shape (batch_size, 1). My code is below.
Is there something wrong with my data? When I train on a synthetically generated dataset, the results are correct.
Thanks.
import math

import numpy as np
import pandas as pd
import torch
import torch.nn as nn


class MDN(nn.Module):
    """A mixture density network layer.

    The input maps to the parameters of a MoG probability distribution, where
    each Gaussian has O dimensions and diagonal covariance.

    Arguments:
        in_features (int): the number of dimensions in the input
        out_features (int): the number of dimensions in the output
        num_gaussians (int): the number of Gaussians per output dimension

    Input:
        minibatch (BxD): B is the batch size and D is the number of input
            dimensions.

    Output:
        (pi, sigma, mu) (BxG, BxGxO, BxGxO): B is the batch size, G is the
            number of Gaussians, and O is the number of dimensions for each
            Gaussian. Pi is a multinomial distribution over the Gaussians.
            Sigma is the standard deviation of each Gaussian. Mu is the mean
            of each Gaussian.
    """

    def __init__(self, n_hidden, num_gaussians, in_features=1, out_features=1):
        super(MDN, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.num_gaussians = num_gaussians
        # Shared hidden layer. This was hard-coded as nn.Linear(1, n_hidden),
        # which silently ignored the in_features argument.
        self.z_h = nn.Sequential(
            nn.Linear(in_features, n_hidden)
        )
        # Mixture weights: a categorical distribution over the G components.
        self.pi = nn.Sequential(
            nn.Linear(n_hidden, num_gaussians),
            nn.Softmax(dim=1)
        )
        self.sigma = nn.Linear(n_hidden, out_features * num_gaussians)
        self.mu = nn.Linear(n_hidden, out_features * num_gaussians)

    def forward(self, minibatch):
        z_h = self.z_h(minibatch)
        pi = self.pi(z_h)
        # exp() keeps the predicted standard deviations strictly positive.
        sigma = torch.exp(self.sigma(z_h))
        sigma = sigma.view(-1, self.num_gaussians, self.out_features)
        mu = self.mu(z_h)
        mu = mu.view(-1, self.num_gaussians, self.out_features)
        return pi, sigma, mu
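For reference, a quick shape check with random inputs (a minimal sketch; the batch size of 4 is arbitrary) confirms the output shapes documented above:

# Minimal shape check with dummy data (batch size 4 is arbitrary).
net = MDN(n_hidden=20, num_gaussians=5)
x = torch.randn(4, 1)
pi, sigma, mu = net(x)
print(pi.shape, sigma.shape, mu.shape)
# torch.Size([4, 5]) torch.Size([4, 5, 1]) torch.Size([4, 5, 1])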
ONEOVERSQRT2PI = 1.0 / math.sqrt(2 * math.pi)


def gaussian_probability(sigma, mu, target):
    """Returns the probability of `target` given MoG parameters `sigma` and `mu`.

    Arguments:
        sigma (BxGxO): The standard deviations of the Gaussians. B is the
            batch size, G is the number of Gaussians, and O is the number of
            dimensions per Gaussian.
        mu (BxGxO): The means of the Gaussians. B is the batch size, G is the
            number of Gaussians, and O is the number of dimensions per
            Gaussian.
        target (BxO): A batch of targets. B is the batch size and O is the
            number of output dimensions per Gaussian.

    Returns:
        probabilities (BxG): The probability density of each target under the
            Gaussian at the corresponding sigma/mu index.
    """
    target = target.unsqueeze(1).expand_as(sigma)
    # Diagonal Gaussian density. Note that this is a probability *density*,
    # not a probability, so individual values can exceed 1 when sigma is small.
    ret = ONEOVERSQRT2PI * torch.exp(-0.5 * ((target - mu) / sigma) ** 2) / sigma
    # Multiply densities across the O output dimensions (diagonal covariance).
    return torch.prod(ret, 2)
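As a sanity check (a minimal sketch, not part of the original code), the density should agree with torch.distributions.Normal:

# Hypothetical sanity check: compare against torch.distributions.Normal.
B, G, O = 4, 5, 1
sigma = torch.rand(B, G, O) + 0.1
mu = torch.randn(B, G, O)
target = torch.randn(B, O)
ours = gaussian_probability(sigma, mu, target)
ref = torch.distributions.Normal(mu, sigma).log_prob(
    target.unsqueeze(1).expand_as(sigma)
).exp().prod(dim=2)
print(torch.allclose(ours, ref, atol=1e-6))  # True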
def mdn_loss(pi, sigma, mu, target):
    """Calculates the error, given the MoG parameters and the target.

    The loss is the negative log-likelihood of the data given the MoG
    parameters.
    """
    # Per-sample mixture density: components weighted by pi, summed over G.
    prob = pi * gaussian_probability(sigma, mu, target)
    # NLL of a continuous *density* is unbounded below: whenever the mixture
    # density at the target exceeds 1, -log(...) is negative.
    nll = -torch.log(torch.sum(prob, dim=1))
    return torch.mean(nll)
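For intuition, a toy example (values chosen purely for illustration) shows how this loss goes negative once the density at the target exceeds 1:

# Toy example (illustrative values): a single sharp Gaussian centered on the
# target has density 1/(sigma*sqrt(2*pi)) > 1 whenever sigma < ~0.3989, so
# the negative log-likelihood is negative.
toy_pi = torch.tensor([[1.0]])
toy_sigma = torch.tensor([[[0.1]]])
toy_mu = torch.tensor([[[0.0]]])
toy_target = torch.tensor([[0.0]])
print(mdn_loss(toy_pi, toy_sigma, toy_mu, toy_target))
# roughly -1.38, i.e. -log(1 / (0.1 * sqrt(2 * pi)))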
network = MDN(n_hidden=20, num_gaussians=5)
optimizer = torch.optim.Adam(network.parameters(), lr=0.0001)

data = pd.read_csv("./experiment/data/sorted_lineitem_1g_quantity_extendedprice.csv")
data.sort_values("extendedprice", inplace=True)
data.reset_index(drop=True, inplace=True)

# Subsample 0.1% of the rows for training.
sample = data.sample(frac=0.001)
quantity = np.array(sample["quantity"])
extendedprice = np.array(sample["extendedprice"])

max_x = np.max(extendedprice)
min_x = np.min(extendedprice)
mean_x = np.mean(extendedprice)
stdev_x = np.std(extendedprice)
max_y = np.max(quantity)
min_y = np.min(quantity)
mean_y = np.mean(quantity)
stdev_y = np.std(quantity)


def normalize(x, min_x, max_x):
    return (x - min_x) / (max_x - min_x)


def standardization(x, mean, stdev):
    return (x - mean) / stdev


# Standardize both variables to zero mean and unit variance. The functions
# operate elementwise, so they apply to the whole array at once.
x_data = standardization(extendedprice, mean_x, stdev_x)
# x_data = extendedprice
y_data = standardization(quantity, mean_y, stdev_y)
# y_data = quantity

x_variable = torch.from_numpy(x_data.astype(np.float32).reshape(-1, 1))
y_variable = torch.from_numpy(y_data.astype(np.float32).reshape(-1, 1))
def train_mdn():
    # Full-batch training: every step uses the entire subsample.
    for epoch in range(10000):
        pi_variable, sigma_variable, mu_variable = network(x_variable)
        loss = mdn_loss(pi_variable, sigma_variable, mu_variable, y_variable)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if epoch % 500 == 0:
            print(epoch, loss.item())


train_mdn()
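Once trained, the fitted conditional distribution p(y|x) can be queried. Below is a minimal sampling sketch (not part of the original code; the query points and helper name sample_mdn are my own) that draws one y per input x by first picking a mixture component and then sampling from its Gaussian:

# Hypothetical usage sketch: draw one sample of y for each query x.
def sample_mdn(x):
    with torch.no_grad():
        pi, sigma, mu = network(x)
        # Choose a mixture component per row according to its weight.
        ks = torch.multinomial(pi, num_samples=1)           # (B, 1)
        idx = ks.unsqueeze(-1).expand(-1, 1, network.out_features)
        chosen_mu = mu.gather(1, idx).squeeze(1)            # (B, O)
        chosen_sigma = sigma.gather(1, idx).squeeze(1)      # (B, O)
        return torch.normal(chosen_mu, chosen_sigma)

samples = sample_mdn(x_variable[:10])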
Output:
0 1.9179250001907349
500 0.6071352362632751
1000 0.12722773849964142
1500 0.05316021665930748
2000 0.026438187807798386
2500 0.011702506802976131
3000 -0.00010265530727338046
3500 -0.011923379264771938
My question is: why does the loss become negative as training progresses?