I am trying to train a fairly standard MLP model using Theano. My model code looks like this:
import numpy as np
import theano
import theano.tensor as T
import time


class Layer(object):
    def __init__(self, inputs, n_in, n_out, activation=T.nnet.softmax):
        def weights(shape):
            return np.array(np.random.uniform(size=shape), dtype='float64')

        def biases(size):
            return np.zeros((size), dtype='float64')

        self.W = theano.shared(value=weights((n_in, n_out)), name='weights', borrow=True)
        self.b = theano.shared(value=biases(n_out), name='biases', borrow=True)
        self.output = activation(T.dot(inputs, self.W) + self.b)
        self.pred = T.argmax(self.output, axis=1)
        self.params = [self.W, self.b]


class MLP(object):
    def __init__(self, inputs, n_in, n_hidden, n_out):
        """ for now lets go with one hidden layer"""
        self._hidden = Layer(inputs, n_in, n_hidden, activation=T.tanh)
        self._output = Layer(self._hidden.output, n_hidden, n_out)  # softmax by default

    def loss(self, one_hot):
        return T.mean(T.sqr(one_hot - self._output.output))

    def accuracy(self, y):
        return T.mean(T.eq(self._output.pred, y))

    def updates(self, loss, rate=0.01):
        # plain SGD updates for both layers
        updates = []
        updates.append((self._hidden.W, self._hidden.W - rate * T.grad(cost=loss, wrt=self._hidden.W)))
        updates.append((self._hidden.b, self._hidden.b - rate * T.grad(cost=loss, wrt=self._hidden.b)))
        updates.append((self._output.W, self._output.W - rate * T.grad(cost=loss, wrt=self._output.W)))
        updates.append((self._output.b, self._output.b - rate * T.grad(cost=loss, wrt=self._output.b)))
        return updates
and then I try to train it like this:
x = T.matrix('x', dtype='float64')
y = T.vector('y', dtype='int32')

# basic logistic model
# model = Layer(x, 784, 10, activation=T.nnet.softmax)
# basic multi-layer perceptron
model = MLP(x, 784, 128, 10)

labels = T.extra_ops.to_one_hot(y, 10)

# loss function
#loss = T.mean(T.sqr(labels - model.output))
loss = model.loss(labels)

# average number of correct predictions over a batch
#accuracy = T.mean(T.eq(model.pred, y))
accuracy = model.accuracy(y)

# updates
#rate = 0.05
#g_W = T.grad(cost=loss, wrt=model.W)
#g_b = T.grad(cost=loss, wrt=model.b)
#updates = [(model.W, model.W - rate * g_W),
#           (model.b, model.b - rate * g_b)]
updates = model.updates(loss, rate=0.3)

# batch index
index = T.scalar('batch index', dtype='int32')
size = T.scalar('batch size', dtype='int32')

train = theano.function([index, size],
                        [loss, accuracy],
                        updates=updates,
                        givens={x: train_set[0][index * size: (index + 1) * size],
                                y: train_set[1][index * size: (index + 1) * size]})

valid = theano.function([index, size],
                        [loss, accuracy],
                        givens={x: valid_set[0][index * size: (index + 1) * size],
                                y: valid_set[1][index * size: (index + 1) * size]})

test = theano.function([index, size],
                       [accuracy],
                       givens={x: test_set[0][index * size: (index + 1) * size],
                               y: test_set[1][index * size: (index + 1) * size]})

n_epochs = 10
batch_size = 500
# number of items in training dataset / batch size
batches_in_epoch = datasets[0][0].shape[0] // batch_size

losses = np.empty(0)
errors = np.empty(0)

for epoch in range(1, n_epochs + 1):
    epoch_losses = np.empty(0)
    epoch_errors = np.empty(0)
    for batch_n in range(batches_in_epoch):
        l, e = train(batch_n, batch_size)
        epoch_losses = np.append(epoch_losses, l)
        epoch_errors = np.append(epoch_errors, e)
        print('[%s]' % time.ctime(),
              'epoch: ', epoch,
              'batch: ', batch_n,
              'loss: ', np.round(l, 4),
              'accuracy: ', np.round(e, 4))
    # shuffle train set every epoch
    shuffle = np.arange(datasets[0][1].shape[0])
    np.random.shuffle(shuffle)
    train_set[0] = train_set[0][shuffle]
    train_set[1] = train_set[1][shuffle]

    losses = np.concatenate([losses, epoch_losses])
    errors = np.concatenate([errors, epoch_errors])

    valid_l, valid_e = valid(0, datasets[1][0].shape[0])
    print('[%s]' % time.ctime(), 'epoch: ', epoch,
          'validation loss: ', valid_l, 'validation accuracy: ', valid_e)

acc = test(0, datasets[2][0].shape[0])
print()
print('Final accuracy: ', np.round(acc, 4)[0])
Now, if you look at the comments, I tried this with a basic logistic regression model and it worked, I got around 80% accuracy. But it stops working when I replace it with my MLP model: it doesn't converge to anything and I get 10% accuracy, i.e. random guessing. What am I doing wrong? The data I'm using is the MNIST dataset, loaded into shared variables the way the Theano tutorial does it.
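By "the way the Theano tutorial does it" I mean roughly the following helper, paraphrased from the tutorial's `shared_dataset` function (the `data_xy` argument and the `raw_*` names below are placeholders for one `(images, labels)` split of the pickled MNIST file):

def shared_dataset(data_xy, borrow=True):
    # wrap one (images, labels) split of MNIST in Theano shared variables
    data_x, data_y = data_xy
    shared_x = theano.shared(np.asarray(data_x, dtype='float64'), borrow=borrow)
    shared_y = theano.shared(np.asarray(data_y, dtype='float64'), borrow=borrow)
    # labels are stored as floats on the device but used as int32 indices
    return shared_x, T.cast(shared_y, 'int32')

# datasets = [shared_dataset(split) for split in (raw_train, raw_valid, raw_test)]
# train_set, valid_set, test_set = datasets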
Answer 0 (score: 0)
The problem seems to be the weight initialization. How did you do it in your tensorflow implementation?
I'm not too sure about the underlying math right now, so correct me if I'm wrong, but I like to interpret it as: if all the weights are positive, the model can't learn negative features.
You can try adding low=-1, high=1 to the initialization (the default for np.random.uniform is between 0 and 1). In my tests this took a very long time to converge (about 100 epochs), but at least it did converge.
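For example, a minimal change to the `weights` helper from your `Layer` class (everything else unchanged) would be something like:

def weights(shape):
    # sample initial weights from [-1, 1) instead of the default [0, 1)
    return np.array(np.random.uniform(low=-1, high=1, size=shape), dtype='float64')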
Using a smarter initialization like glorot initialization:
def weights(shape):
    return np.random.uniform(low=-np.sqrt(6. / sum(shape)),
                             high=np.sqrt(6. / sum(shape)),
                             size=shape)
makes the training much faster. Adding this to your code, I got around 90% validation accuracy after about 5 epochs.
This is also how the weights are initialized in the theano MLP example.
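For reference, the hidden-layer initialization in that example looks roughly like this (paraphrased; `rng` is a `numpy.random.RandomState` that the tutorial passes into the layer):

W_values = np.asarray(
    rng.uniform(
        low=-np.sqrt(6. / (n_in + n_out)),
        high=np.sqrt(6. / (n_in + n_out)),
        size=(n_in, n_out)
    ),
    dtype=theano.config.floatX
)
# the tutorial scales the interval by 4 when the activation is sigmoid
if activation == T.nnet.sigmoid:
    W_values *= 4
W = theano.shared(value=W_values, name='W', borrow=True)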