Keras model gets constant loss and accuracy

Date: 2017-02-25 08:02:43

Tags: python machine-learning deep-learning keras

I am trying to train a Keras CNN on the Street View House Numbers dataset. You can find the project here. The problem is that during training, neither the loss nor the accuracy changes over time. I have tried 1-channel (grayscale) and RGB (3-channel) images, larger (50, 50) and smaller (28, 28) images, more and fewer filters in the convolutional layers, larger and smaller patches in the pooling layers, with and without dropout, larger and smaller batch sizes, smaller learning steps for the optimizer, different optimizers, ...

Training still ends up with constant loss and constant accuracy.

Here is how I prepare the data:

import os
import numpy as np
from PIL import Image
from PIL import ImageFilter
# train_data is assumed to be the parsed digitStruct annotations, loaded elsewhere
train_folders = 'sv_train/train'
test_folders = 'test'
extra_folders = 'extra'
SV_IMG_SIZE = 28
SV_CHANNELS = 3
train_imsize = np.ndarray([len(train_data),2])
k = 500
sv_images = []
max_images = 20000  # len(train_data)
max_digits = 5
sv_labels = np.ones([max_images, max_digits], dtype=int) * 10  # init to 10, which means "no digit"
nboxes = [[] for i in range(max_images)]
print ("%d to load" % len(train_data))
def getBBox(i,perc):

    boxes = train_data[i]['boxes'] 
    x_min=9990
    y_min=9990
    x_max=0
    y_max=0
    for bid,b in enumerate(boxes):
        x_min = b['left'] if b['left'] <= x_min else x_min
        y_min = b['top'] if b['top'] <= y_min else y_min
        x_max = b['left']+b['width'] if  b['left']+b['width'] >= x_max else x_max
        y_max = b['top']+b['height'] if b['top']+b['height'] >= y_max else y_max

    dy = y_max-y_min
    dx = x_max-x_min
    dpy = dy*perc
    dpx = dx*perc
    nboxes[i]=[dpx,dpy,dx,dy]
    return x_min-dpx, y_min-dpy, x_max+dpx, y_max+dpy

for i in range(max_images):
    print (" \r%d" % i ,end="")
    filename = train_data[i]['filename']
    fullname = os.path.join(train_folders, filename)
    boxes = train_data[i]['boxes']
    label = [10,10,10,10,10]
    lb = len(boxes)
    if lb <= max_digits:
        im = Image.open(fullname)
        x_min, y_min, x_max, y_max = getBBox(i,0.3)
        im = im.crop([x_min,y_min,x_max,y_max])
        owidth, oheight = im.size
        wr = SV_IMG_SIZE/float(owidth)
        hr = SV_IMG_SIZE/float(oheight)
        for bid,box in  enumerate(boxes):
            sv_labels[i][max_digits-lb+bid] = int(box['label'])

        box = nboxes[i]
        box[0]*=wr
        box[1]*=wr
        box[2]*=hr
        box[3]*=hr
        im = im.resize((SV_IMG_SIZE,SV_IMG_SIZE),Image.ANTIALIAS)
        array = np.asarray(im)
        array =  array.reshape((SV_IMG_SIZE,SV_IMG_SIZE,SV_CHANNELS)).astype(np.float32)
        na = np.zeros([SV_IMG_SIZE,SV_IMG_SIZE,SV_CHANNELS],dtype=int)
        sv_images.append(array.astype(np.float32))

Here is the model:

from keras.models import Sequential, Model
from keras.layers import Input, Convolution2D, MaxPooling2D, Flatten, Dense
from keras.optimizers import Adam
from keras.utils.np_utils import to_categorical

adam = Adam(lr=0.5)

model = Sequential()
x = Input((SV_IMG_SIZE, SV_IMG_SIZE,SV_CHANNELS))

y = Convolution2D(16, 3, 3, activation='relu', border_mode='same')(x)
y = Convolution2D(32, 3, 3, activation='relu', border_mode='valid')(y)
y = MaxPooling2D((2, 2))(y)
y = Convolution2D(128, 3, 3, activation='relu', border_mode='valid')(y)
y = MaxPooling2D((2, 2))(y)
y = Flatten()(y)
y = Dense(512, activation='relu')(y)


digit1 = Dense(11, activation="softmax")(y)
digit2 = Dense(11, activation="softmax")(y)
digit3 = Dense(11, activation="softmax")(y)
digit4 = Dense(11, activation="softmax")(y)
digit5 = Dense(11, activation="softmax")(y)
model = Model(input=x, output=[digit1, digit2, digit3,digit4,digit5])


model.compile(optimizer=adam,
          loss='categorical_crossentropy',
          metrics=['accuracy'])


# svt_labels / svv_labels and sv_train / sv_validation come from a train/validation split not shown here
sv_train_labels = [to_categorical(svt_labels[:,0]),
                   to_categorical(svt_labels[:,1]),
                   to_categorical(svt_labels[:,2]),
                   to_categorical(svt_labels[:,3]),
                   to_categorical(svt_labels[:,4])]
sv_validation_labels = [to_categorical(svv_labels[:,0]),
                        to_categorical(svv_labels[:,1]),
                        to_categorical(svv_labels[:,2]),
                        to_categorical(svv_labels[:,3]),
                        to_categorical(svv_labels[:,4])]

model.fit(sv_train, sv_train_labels, nb_epoch=50, batch_size=8,validation_data=(sv_validation, sv_validation_labels))

3 Answers:

Answer 0 (score: 2)

As per my comment above, I suggest avoiding training the model to predict combinations of 5 digits. It would be more effective to train the model to predict single digits. I tried to build a quick example based on the Keras example cifar10_cnn.py, using the MNIST-like SVHN format 2 (cropped digits) dataset:

import numpy as np
import scipy.io as sio
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils.np_utils import to_categorical

# parameters
nb_epoch = 10
batch_size = 32

# load data
nb_classes = 10
train_data = sio.loadmat('train_32x32.mat')
test_data = sio.loadmat('test_32x32.mat')
X_train = train_data['X'].T / 255  # 'X' is (32, 32, 3, N); .T gives (N, 3, 32, 32), scaled to [0, 1]
X_test = test_data['X'].T / 255
y_train = to_categorical(train_data['y'] % nb_classes)  # SVHN stores digit 0 as label 10; % 10 maps it back to class 0
y_test = to_categorical(test_data['y'] % nb_classes)

# model
model = Sequential()
model.add(Convolution2D(32, 3, 3, border_mode='same', input_shape=X_train.shape[1:]))
model.add(Activation('relu'))
model.add(Convolution2D(32, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Convolution2D(64, 3, 3, border_mode='same'))
model.add(Activation('relu'))
model.add(Convolution2D(64, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# train
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, validation_data=(X_test, y_test), shuffle=True)

After training the model, train another model to recognize/extract each number from an image, using a library such as OpenCV.
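As a rough sketch of what that extraction step could look like, one simple, non-learned alternative is plain contour detection with OpenCV feeding the single-digit classifier trained above. The file name, threshold settings and area filter below are illustrative assumptions, not a tested pipeline; model is the classifier from the code above:

import cv2
import numpy as np

# Hypothetical input image containing a house number
img = cv2.imread('house_number.png')
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
_, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

# findContours returns different tuples across OpenCV versions; [-2] is the contour list in all of them
contours = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

digits = []
for c in sorted(contours, key=lambda c: cv2.boundingRect(c)[0]):  # left to right
    x, y, w, h = cv2.boundingRect(c)
    if w * h < 100:  # illustrative filter to drop tiny specks
        continue
    crop = cv2.resize(img[y:y + h, x:x + w], (32, 32)).astype('float32') / 255
    # the model above was trained on (3, 32, 32) arrays (the .T of the .mat data); note cv2 loads BGR, not RGB
    pred = model.predict(crop.T[np.newaxis])
    digits.append(int(np.argmax(pred)))

print(digits)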

Answer 1 (score: 1)

In cases like this it is most of the time a faulty training set. I would suggest that you look at the actual images and labels you feed into the network. Also look at the actual color scale of the images, i.e. understand how their values are distributed; this often leads to the solution. In any case, once you are able to map them properly, your model should also start learning at a good rate.
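For example, a quick sanity check could look like the sketch below. It assumes matplotlib is available and that the sv_images / sv_labels arrays from the question are in scope; since the question's preprocessing never divides by 255, the printed range will likely be 0.0-255.0:

import numpy as np
import matplotlib.pyplot as plt

sv_train = np.asarray(sv_images)
print("pixel value range:", sv_train.min(), "-", sv_train.max())

fig, axes = plt.subplots(1, 5, figsize=(12, 3))
for ax, idx in zip(axes, np.random.choice(len(sv_train), 5, replace=False)):
    ax.imshow(sv_train[idx].astype('uint8'))
    ax.set_title(str(sv_labels[idx]))  # do the labels match the digits you see?
    ax.axis('off')
plt.show()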

Answer 2 (score: 0)

Why does 104 get the label [10 10 1 10 4]? I think it should be [10 10 1 0 4], shouldn't it?
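If so, the likely cause is that the SVHN annotations store the digit 0 as label 10, which collides with the 10 the question uses as the "no digit" placeholder. A minimal sketch of a fix for the label assignment inside the question's preparation loop (the same mapping answer 0 applies with % nb_classes):

# SVHN stores digit 0 as label 10; map it back to 0 so that class 10
# stays reserved for "no digit"
for bid, box in enumerate(boxes):
    sv_labels[i][max_digits - lb + bid] = int(box['label']) % 10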

The way I see it: either there is a problem with your input data (the preparation may be wrong), or your architecture is not suited to this problem.

It is training; you can see in your notebook that the loss changes somewhat between 1 and 2. So it is not a training problem.