I am training a CNN for CIFAR-10 classification in both MATLAB and TensorFlow. As far as I can tell, the two networks are identical:
TensorFlow
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Conv2D, BatchNormalization, Activation, Dropout, MaxPool2D, Flatten, Dense
import numpy as np
import datetime
import os
cifar10 = tf.keras.datasets.cifar10
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
base_lr = 0.005
for k in range(1):
    lr = base_lr * (0.9 ** k)  # lr = base_lr
    model = Sequential()
    # Layer 1
    model.add(Conv2D(input_shape=(32, 32, 3), filters=128, kernel_size=5, strides=(1, 1), padding="valid",
                     kernel_initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01),
                     bias_initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01)))
    model.add(BatchNormalization(epsilon=1e-5, momentum=0.0))
    model.add(Activation("relu"))
    model.add(MaxPool2D(pool_size=(3, 3), strides=(1, 1), padding="valid"))
    # Layer 2
    model.add(Conv2D(filters=128, kernel_size=5, strides=(1, 1), padding="valid",
                     kernel_initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01),
                     bias_initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01)))
    model.add(BatchNormalization(epsilon=1e-5, momentum=0.0))
    model.add(Activation("relu"))
    model.add(MaxPool2D(pool_size=(3, 3), strides=(1, 1), padding="valid"))
    # Layer 3
    model.add(Conv2D(filters=128, kernel_size=5, strides=(1, 1), padding="valid",
                     kernel_initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01),
                     bias_initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01)))
    model.add(BatchNormalization(epsilon=1e-5, momentum=0.0))
    model.add(Activation("relu"))
    model.add(MaxPool2D(pool_size=(2, 2), strides=(2, 2), padding="valid"))
    # Fully connected layer
    model.add(Flatten())
    layer_length = model.layers[-1].output_shape[-1]  # layer_length = 8192
    model.add(Dense(10, kernel_initializer=tf.random_normal_initializer(mean=0.0, stddev=1/np.sqrt(layer_length)),
                    bias_initializer=tf.random_normal_initializer(mean=0.0, stddev=1/np.sqrt(10)), activation="softmax"))
    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.0),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=['accuracy'])
    sub_dir = datetime.datetime.strftime(datetime.datetime.now(), "%H_%M_%S_%m_%d_%y")
    log_dir = "D:\\DataSet\\CIFAR10\\TensorBoard\\" + sub_dir
    os.mkdir(log_dir)
    model.fit(x_train, y_train, batch_size=256, epochs=100, validation_data=(x_test, y_test), verbose=2,
              callbacks=[tf.keras.callbacks.TensorBoard(log_dir=log_dir)])
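To back up the claim that the two networks match layer-for-layer, the Keras output shapes can be dumped and compared against the LOutSize bookkeeping in the MATLAB script below. This is only a diagnostic sketch appended after the model is built; it is not part of the original training script:

# Diagnostic only (not in the original script): print each layer's output shape
# so it can be compared with the LOutSize values tracked in the MATLAB code.
model.summary()
for layer in model.layers:
    print(layer.name, layer.output_shape)
# Expected: the Flatten layer outputs 8 * 8 * 128 = 8192 features, the same
# fan-in used for the fully connected weight initialization in both scripts.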
MATLAB
Specs.Conv1Channels = 128;
Specs.Conv2Channels = 128;
Specs.Conv3Channels = 128;
Specs.DropOutProb = 0; % no dropout
Specs.Momentum = 0; % no momentum
ImSize = [32 32 3];
MaxEpochs = 100;
num_of_batches = 5;
B = cell(num_of_batches,1);
for kBatch = 1:num_of_batches
B{kBatch} = load(['D:\DataSet\CIFAR10\cifar-10-batches-mat\data_batch_',num2str(kBatch),'.mat']);
end
C = cell2mat(B(:));
X = double(rot90(reshape(cell2mat({C.data}')',ImSize(1),ImSize(2),ImSize(3),[]),-1))/255;
Y = categorical(cell2mat({C.labels}'));
T = {load('D:\DataSet\CIFAR10\cifar-10-batches-mat\test_batch.mat')};
C = cell2mat(T(:));
Xt = double(rot90(reshape(cell2mat({C.data}')',ImSize(1),ImSize(2),ImSize(3),[]),-1))/255;
Yt = categorical(cell2mat({C.labels}'));
MiniBatchSizes = 256;
lrs = 0.005;
for lr = 1:length(lrs)
for mbs = 1:length(MiniBatchSizes)
IterPerEpoch = ceil(size(X,4) / MiniBatchSizes(mbs));
layers(1) = imageInputLayer(ImSize,'Name','input','Normalization','none');
% Layer 1
Conv1KernelSize = 5;
layers(2) = convolution2dLayer(Conv1KernelSize,Specs.Conv1Channels,'Name','conv1');
LOutSize = [ImSize(1) - Conv1KernelSize + 1 ImSize(2) - Conv1KernelSize + 1 ImSize(3) Specs.Conv1Channels];
layers(3) = batchNormalizationLayer;
layers(4) = reluLayer('Name','relu1');
layers(5) = dropoutLayer(Specs.DropOutProb);
layers(6) = maxPooling2dLayer(3,'Stride',1,'Name','pool1');
LOutSize = LOutSize - [2 2 0 0];
%Layer 2
Conv2KernelSize = 5;
layers(7) = convolution2dLayer(Conv2KernelSize,Specs.Conv2Channels,'Name','conv2');
LOutSize = [LOutSize(1) - Conv2KernelSize + 1 LOutSize(2) - Conv2KernelSize + 1 LOutSize(4) Specs.Conv2Channels];% track [H W Cin Cout]
layers(8) = batchNormalizationLayer;
layers(9) = reluLayer('Name','relu2');
layers(10) = dropoutLayer(Specs.DropOutProb);
layers(11) = maxPooling2dLayer(3,'Stride',1,'Name','pool2');
LOutSize = LOutSize - [2 2 0 0];
%Layer 3
Conv3KernelSize = 5;
layers(12) = convolution2dLayer(Conv3KernelSize,Specs.Conv3Channels,'Name','conv3');
LOutSize = [LOutSize(1) - Conv3KernelSize + 1 LOutSize(2) - Conv3KernelSize + 1 LOutSize(4) Specs.Conv3Channels];% track [H W Cin Cout]
layers(13) = batchNormalizationLayer;
layers(14) = reluLayer('Name','relu3');
layers(15) = dropoutLayer(Specs.DropOutProb);
layers(16) = maxPooling2dLayer(2,'Stride',2,'Name','pool3');
LOutSize = [LOutSize(1:2) / 2 LOutSize(3:4)];
%FC Layer
layers(17) = fullyConnectedLayer(10,'Name','fc4');
layers(18) = softmaxLayer('Name','softmax');
layers(19) = classificationLayer('Name','outlayer');
%Initialization
layers(2).Weights = 0.01 * randn([Conv1KernelSize Conv1KernelSize 3 Specs.Conv1Channels]);
layers(2).Bias = 0.01 * randn([1 1 Specs.Conv1Channels]);
layers(7).Weights = 0.01 * randn([Conv2KernelSize Conv2KernelSize Specs.Conv1Channels Specs.Conv2Channels]);
layers(7).Bias = 0.01 * randn([1 1 Specs.Conv2Channels]);
layers(12).Weights = 0.01 * randn([Conv3KernelSize Conv3KernelSize Specs.Conv2Channels Specs.Conv3Channels]);
layers(12).Bias = 0.01 * randn([1 1 Specs.Conv3Channels]);
%LOutSize(1) * LOutSize(2) * Specs.Conv3Channels = 8192
layers(17).Weights = 1/ sqrt(LOutSize(1) * LOutSize(2) * Specs.Conv3Channels) * randn(10,LOutSize(1) * LOutSize(2) * Specs.Conv3Channels);
layers(17).Bias = 1/ sqrt(10) * randn(10,1);
options = trainingOptions('sgdm', ...
    'MaxEpochs',MaxEpochs, ...
    'InitialLearnRate',lrs(lr), ...
    'MiniBatchSize',MiniBatchSizes(mbs), ...
    'Momentum',Specs.Momentum, ...
    'Shuffle','every-epoch', ...
    'ValidationData',{Xt,Yt}, ...
    'ValidationFrequency',1 * IterPerEpoch, ...
    'VerboseFrequency',1 * IterPerEpoch, ...
    'Plots','none', ...
    'LearnRateSchedule','none', ...
    'ExecutionEnvironment','gpu', ...
    'L2Regularization',0.0); % no L2 regularization
trainedNet = trainNetwork(X,Y,layers,options);
end
end
For a reason I cannot understand, the TensorFlow validation accuracy fluctuates wildly, while the MATLAB validation accuracy always looks reasonable and smooth.
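For reference, the fluctuation I am describing is in the per-epoch validation accuracy reported during model.fit. A minimal sketch of how it can be dumped numerically rather than read off TensorBoard, assuming the fit call above is assigned to a variable (it is not in the script as posted):

# Hypothetical diagnostic: capture the Keras training history and print the
# per-epoch validation accuracy to see the fluctuation as numbers.
history = model.fit(x_train, y_train, batch_size=256, epochs=100,
                    validation_data=(x_test, y_test), verbose=2,
                    callbacks=[tf.keras.callbacks.TensorBoard(log_dir=log_dir)])
for epoch, acc in enumerate(history.history['val_accuracy'], start=1):
    print(f"epoch {epoch:3d}: val_accuracy = {acc:.4f}")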
What am I missing here?