I compared Keras (with the Theano backend) and Lasagne on a toy regression problem in order to choose one of them for my final application. In this comparison Lasagne performed much worse than Keras, and I started to doubt my code. Since I am new to both Keras and Lasagne, I would like someone more experienced to check it. The network should be trained to find the mean of a 16x16 matrix. I made different attempts: first with a 2D convolutional layer plus a dense layer (because my final application will require a CNN). Then, since the Lasagne results were terrible, I tried a standard single-layer MLP. Again, terrible Lasagne performance. I used the same specifications in both cases: same batch size, same initialization, same optimizer (I tested both SGD with Nesterov momentum and ADAM), and of course the same number of epochs and the same network architecture. Can someone tell me what is going on? Is there something wrong in my code? Why is the difference in performance so large? If everything is correct, why is Keras so much better than Lasagne?
Here is the code I am using:
Keras:
# -*- coding: utf-8 -*-
import numpy as np
np.random.seed(1337) # for reproducibility
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.layers import Convolution2D
from keras import backend as K
from keras.optimizers import SGD
import matplotlib.pyplot as plt
batch_size = 500
nb_output = 1
nb_epoch = 10
# input image dimensions
img_rows, img_cols = 16, 16
# number of convolutional filters to use
nb_filters = 20
# size of pooling area for max pooling
pool_size = (2, 2)
# convolution kernel size
kernel_size = (3, 3)
X_train = np.random.randn(10000, 16*16)
Y_train = np.mean(X_train, 1)
X_train = X_train.astype('float32')
X_test = np.random.randn(1000, 16*16)
Y_test = np.mean(X_test, 1)
if K._BACKEND == 'theano':
    # Theano dim ordering: (samples, channels, rows, cols)
    X_train = np.reshape(X_train, (10000, 1, 16, 16))
    X_test = np.reshape(X_test, (1000, 1, 16, 16))
else:
    # TensorFlow dim ordering: (samples, rows, cols, channels)
    X_train = np.reshape(X_train, (10000, 16, 16, 1))
    X_test = np.reshape(X_test, (1000, 16, 16, 1))
print('X_train shape:', X_train.shape)
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')
model = Sequential()
model.add(Convolution2D(nb_filters, kernel_size[0], kernel_size[1],
                        border_mode='same',
                        input_shape=X_train.shape[1:], init='glorot_uniform'))
model.add(Activation('relu'))
#model.add(Flatten(input_shape=X_train.shape[1:]))
model.add(Flatten())
model.add(Dense(10, init='glorot_uniform'))
model.add(Activation('sigmoid'))
model.add(Dense(nb_output, init='glorot_uniform'))
model.add(Activation('linear'))
sgd = SGD(lr=0.1, momentum=0.9, nesterov=True)  # decay=1e-6,
model.compile(loss='mse',
              optimizer=sgd)
model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
          verbose=1, validation_data=(X_test, Y_test))
score = model.evaluate(X_test, Y_test, verbose=1)
predicts = model.predict(X_test, batch_size=1000, verbose=0)
print('Test score:', score)  # evaluate() returns just the scalar loss here (no extra metrics were compiled)
plt.figure()
plt.scatter(Y_test, predicts)
Lasagne (adapted from the mnist example):
# -*- coding: utf-8 -*-
from __future__ import print_function
import time
import numpy as np
import theano
import theano.tensor as T
import lasagne
import matplotlib.pyplot as plt
def load_dataset():
    np.random.seed(1337)
    X_train = np.random.randn(10000, 16*16)
    X_train = X_train.astype('float32')
    Y_train = np.mean(X_train, 1)
    X_test = np.random.randn(1000, 16*16)
    X_test = X_test.astype('float32')
    Y_test = np.mean(X_test, 1)
    X_train = np.reshape(X_train, (10000, 1, 16, 16))
    X_test = np.reshape(X_test, (1000, 1, 16, 16))
    return X_train, Y_train, X_test, Y_test
def build_cnn(input_var=None):
    network = lasagne.layers.InputLayer(shape=(None, 1, 16, 16),
                                        input_var=input_var)
    network = lasagne.layers.Conv2DLayer(
        network, num_filters=20, filter_size=(3, 3),
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.GlorotUniform())
    network = lasagne.layers.DenseLayer(
        network,
        num_units=10,
        nonlinearity=lasagne.nonlinearities.sigmoid)
    network = lasagne.layers.DenseLayer(
        network,
        num_units=1,
        nonlinearity=lasagne.nonlinearities.linear)
    return network
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    assert len(inputs) == len(targets)
    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield inputs[excerpt], targets[excerpt]
def main(model='cnn', num_epochs=10):
    print("Loading data...")
    X_train, y_train, X_test, y_test = load_dataset()
    input_var = T.tensor4('inputs')
    target_var = T.vector('targets')
    print("Building model and compiling functions...")
    network = build_cnn(input_var)
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.squared_error(prediction, target_var)
    loss = loss.mean()
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=0.1, momentum=0.9)
    # updates = lasagne.updates.adam(loss, params)
    test_prediction = lasagne.layers.get_output(network)
    test_loss = lasagne.objectives.squared_error(test_prediction,
                                                 target_var)
    test_loss = test_loss.mean()
    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    val_fn = theano.function([input_var, target_var], test_loss)
    preds = theano.function([input_var], test_prediction)
    print("Starting training...")
    for epoch in range(num_epochs):
        train_err = 0.0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, 500, shuffle=False):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1
        test_err = 0.0
        test_batches = 0
        for batch in iterate_minibatches(X_test, y_test, 500, shuffle=False):
            inputs, targets = batch
            err = val_fn(inputs, targets)
            test_err += err
            test_batches += 1
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  test loss:\t\t{:.6f}".format(test_err / test_batches))
    pds = preds(X_test)
    plt.scatter(y_test, pds)
    plt.show()
if __name__ == '__main__':
    main()
Both scripts are easy to adapt to a single-layer MLP (a sketch of that adaptation is included after the plots below). If you run them, you will get these scatter plots at the end:
Lasagne: (scatter plot)
Keras: (scatter plot)
On the x axis: true values; on the y axis: predicted values.
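For reference, this is roughly what I mean by the single-layer MLP variant: a minimal sketch with the same hyperparameters as above, not the exact scripts I ran (the helper name build_mlp is just for illustration):
# -*- coding: utf-8 -*-
# Sketch of the MLP variant. Keras side: drop the convolution and Flatten,
# feed the flat 256-dimensional vectors straight into the hidden Dense layer.
import numpy as np
np.random.seed(1337)  # for reproducibility
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD

X_train = np.random.randn(10000, 16 * 16).astype('float32')
Y_train = np.mean(X_train, 1)
X_test = np.random.randn(1000, 16 * 16).astype('float32')
Y_test = np.mean(X_test, 1)

model = Sequential()
model.add(Dense(10, input_dim=16 * 16, init='glorot_uniform'))
model.add(Activation('sigmoid'))
model.add(Dense(1, init='glorot_uniform'))
model.add(Activation('linear'))
sgd = SGD(lr=0.1, momentum=0.9, nesterov=True)
model.compile(loss='mse', optimizer=sgd)
model.fit(X_train, Y_train, batch_size=500, nb_epoch=10,
          verbose=1, validation_data=(X_test, Y_test))

# Lasagne side: swap build_cnn for this builder in the script above.
# DenseLayer flattens the trailing (1, 16, 16) input dimensions itself.
import lasagne

def build_mlp(input_var=None):
    network = lasagne.layers.InputLayer(shape=(None, 1, 16, 16),
                                        input_var=input_var)
    network = lasagne.layers.DenseLayer(
        network, num_units=10,
        nonlinearity=lasagne.nonlinearities.sigmoid,
        W=lasagne.init.GlorotUniform())
    network = lasagne.layers.DenseLayer(
        network, num_units=1,
        nonlinearity=lasagne.nonlinearities.linear)
    return network
The rest of the Lasagne script (loss, updates, training loop) stays exactly as above.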