I am trying to train a fairly standard MLP model using Theano. My model code looks like this:
import numpy as np
import theano
import theano.tensor as T
import time


class Layer(object):
    def __init__(self, inputs, n_in, n_out, activation=T.nnet.softmax):
        def weights(shape):
            return np.array(np.random.uniform(size=shape), dtype='float64')

        def biases(size):
            return np.zeros((size), dtype='float64')

        self.W = theano.shared(value=weights((n_in, n_out)), name='weights', borrow=True)
        self.b = theano.shared(value=biases(n_out), name='biases', borrow=True)
        self.output = activation(T.dot(inputs, self.W) + self.b)
        self.pred = T.argmax(self.output, axis=1)
        self.params = [self.W, self.b]


class MLP(object):
    def __init__(self, inputs, n_in, n_hidden, n_out):
        """ for now lets go with one hidden layer"""
        self._hidden = Layer(inputs, n_in, n_hidden, activation=T.tanh)
        self._output = Layer(self._hidden.output, n_hidden, n_out)  # softmax by default

    def loss(self, one_hot):
        return T.mean(T.sqr(one_hot - self._output.output))

    def accuracy(self, y):
        return T.mean(T.eq(self._output.pred, y))

    def updates(self, loss, rate=0.01):
        # plain SGD updates for both layers
        updates = []
        updates.append((self._hidden.W, self._hidden.W - rate * T.grad(cost=loss, wrt=self._hidden.W)))
        updates.append((self._hidden.b, self._hidden.b - rate * T.grad(cost=loss, wrt=self._hidden.b)))
        updates.append((self._output.W, self._output.W - rate * T.grad(cost=loss, wrt=self._output.W)))
        updates.append((self._output.b, self._output.b - rate * T.grad(cost=loss, wrt=self._output.b)))
        return updates
and then I try to train it like this:
x = T.matrix('x', dtype='float64')
y = T.vector('y', dtype='int32')

# basic logistic model
# model = Layer(x, 784, 10, activation=T.nnet.softmax)
# basic multi-layer perceptron
model = MLP(x, 784, 128, 10)

labels = T.extra_ops.to_one_hot(y, 10)

# loss function
#loss = T.mean(T.sqr(labels - model.output))
loss = model.loss(labels)

# average number of correct predictions over a batch
#accuracy = T.mean(T.eq(model.pred, y))
accuracy = model.accuracy(y)

# updates
#rate = 0.05
#g_W = T.grad(cost=loss, wrt=model.W)
#g_b = T.grad(cost=loss, wrt=model.b)
#updates = [(model.W, model.W - rate * g_W),
#           (model.b, model.b - rate * g_b)]
updates = model.updates(loss, rate=0.3)

# batch index
index = T.scalar('batch index', dtype='int32')
size = T.scalar('batch size', dtype='int32')

train = theano.function([index, size],
                        [loss, accuracy],
                        updates=updates,
                        givens={x: train_set[0][index * size: (index + 1) * size],
                                y: train_set[1][index * size: (index + 1) * size]})

valid = theano.function([index, size],
                        [loss, accuracy],
                        givens={x: valid_set[0][index * size: (index + 1) * size],
                                y: valid_set[1][index * size: (index + 1) * size]})

test = theano.function([index, size],
                       [accuracy],
                       givens={x: test_set[0][index * size: (index + 1) * size],
                               y: test_set[1][index * size: (index + 1) * size]})

n_epochs = 10
batch_size = 500
# number of items in training dataset / batch size
batches_in_epoch = datasets[0][0].shape[0] // batch_size

losses = np.empty(0)
errors = np.empty(0)

for epoch in range(1, n_epochs + 1):
    epoch_losses = np.empty(0)
    epoch_errors = np.empty(0)
    for batch_n in range(batches_in_epoch):
        l, e = train(batch_n, batch_size)
        epoch_losses = np.append(epoch_losses, l)
        epoch_errors = np.append(epoch_errors, e)
        print('[%s]' % time.ctime(),
              'epoch: ', epoch,
              'batch: ', batch_n,
              'loss: ', np.round(l, 4),
              'accuracy: ', np.round(e, 4))
    # shuffle train set every epoch
    shuffle = np.arange(datasets[0][1].shape[0])
    np.random.shuffle(shuffle)
    train_set[0] = train_set[0][shuffle]
    train_set[1] = train_set[1][shuffle]

    losses = np.concatenate([losses, epoch_losses])
    errors = np.concatenate([errors, epoch_errors])

    valid_l, valid_e = valid(0, datasets[1][0].shape[0])
    print('[%s]' % time.ctime(), 'epoch: ', epoch,
          'validation loss: ', valid_l, 'validation accuracy: ', valid_e)

acc = test(0, datasets[2][0].shape[0])
print()
print('Final accuracy: ', np.round(acc, 4)[0])
Now, if you look at the comments, I tried this with a basic logistic regression model and it worked, I got around 80% accuracy. But it stops working when I replace it with my MLP model: it doesn't converge to anything and I get 10% accuracy, i.e. random guessing. What am I doing wrong? The data I'm using is the MNIST dataset, loaded into shared variables the way the Theano tutorial does it.
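By "the way the Theano tutorial does it" I mean roughly the following helper, paraphrased from the tutorial's `shared_dataset` function (the `data_xy` argument and the `raw_*` names below are placeholders for one `(images, labels)` split of the pickled MNIST file):

def shared_dataset(data_xy, borrow=True):
    # wrap one (images, labels) split of MNIST in Theano shared variables
    data_x, data_y = data_xy
    shared_x = theano.shared(np.asarray(data_x, dtype='float64'), borrow=borrow)
    shared_y = theano.shared(np.asarray(data_y, dtype='float64'), borrow=borrow)
    # labels are stored as floats on the device but used as int32 indices
    return shared_x, T.cast(shared_y, 'int32')

# datasets = [shared_dataset(split) for split in (raw_train, raw_valid, raw_test)]
# train_set, valid_set, test_set = datasets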
Answer 0 (score: 0)
The problem seems to be the weight initialization. How did you do it in your tensorflow implementation?
I'm not too sure about the underlying math right now, so correct me if I'm wrong, but I like to interpret it as: if all the weights are positive, the model can't learn negative features.
You can try adding low=-1, high=1 to the initialization (the default for np.random.uniform is between 0 and 1). In my tests this took a very long time to converge (about 100 epochs), but at least it did converge.
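For example, a minimal change to the `weights` helper from your `Layer` class (everything else unchanged) would be something like:

def weights(shape):
    # sample initial weights from [-1, 1) instead of the default [0, 1)
    return np.array(np.random.uniform(low=-1, high=1, size=shape), dtype='float64')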
Using a smarter initialization like glorot initialization:
def weights(shape):
    return np.random.uniform(low=-np.sqrt(6. / sum(shape)),
                             high=np.sqrt(6. / sum(shape)),
                             size=shape)
makes the training much faster. Adding this to your code, I got around 90% validation accuracy after about 5 epochs.
This is also how the weights are initialized in the theano MLP example.
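For reference, the hidden-layer initialization in that example looks roughly like this (paraphrased; `rng` is a `numpy.random.RandomState` that the tutorial passes into the layer):

W_values = np.asarray(
    rng.uniform(
        low=-np.sqrt(6. / (n_in + n_out)),
        high=np.sqrt(6. / (n_in + n_out)),
        size=(n_in, n_out)
    ),
    dtype=theano.config.floatX
)
# the tutorial scales the interval by 4 when the activation is sigmoid
if activation == T.nnet.sigmoid:
    W_values *= 4
W = theano.shared(value=W_values, name='W', borrow=True)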