How do I fix the overfitting problem in the following code?

Date: 2019-12-30 05:55:48

Tags: python machine-learning


I am trying to build handwritten word recognition using the IAM dataset. During training I ran into an overfitting problem. Could you help me figure out what mistake I made in the code below?

I have tried every solution I could find for this problem, but the same overfitting issue persists.

import os
import fnmatch
import cv2
import numpy as np
import string
import time
import random
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, LSTM, Reshape, BatchNormalization, Input, Conv2D, MaxPool2D, Lambda, Bidirectional, Dropout
from keras.models import Model
from keras.activations import relu, sigmoid, softmax
import keras.backend as K
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint



imgSize = (128,32)

def preprocess(img, imgSize, dataAugmentation=False):

    "put img into target img of size imgSize, transpose for TF and normalize gray-values"

    # there are damaged files in IAM dataset - just use black image instead
    if img is None:
        img = np.zeros([imgSize[1], imgSize[0]])

    # increase dataset size by applying random stretches to the images
    if dataAugmentation:
        stretch = (random.random() - 0.5) # -0.5 .. +0.5
        wStretched = max(int(img.shape[1] * (1 + stretch)), 1) # random width, but at least 1
        img = cv2.resize(img, (wStretched, img.shape[0])) # stretch horizontally by factor 0.5 .. 1.5
        img = cv2.adaptiveThreshold(img,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,cv2.THRESH_BINARY,11,2)
        # print('Data Augmented')

    # create target image and copy sample image into it
    (wt, ht) = imgSize
    (h, w) = img.shape
    fx = w / wt
    fy = h / ht
    f = max(fx, fy)
    newSize = (max(min(wt, int(w / f)), 1), max(min(ht, int(h / f)), 1)) # scale according to f (result at least 1 and at most wt or ht)
    img = cv2.resize(img, newSize)
    target = np.ones([ht, wt]) * 255
    target[0:newSize[1], 0:newSize[0]] = img

    # transpose for TF
    img = cv2.transpose(target)

    # normalize
    (m, s) = cv2.meanStdDev(img)
    m = m[0][0]
    s = s[0][0]
    img = img - m
    img = img / s if s>0 else img
    img = np.expand_dims(img , axis = 2) 

    return img

def truncateLabel(text, maxTextLen):
    # CTC inserts a blank between repeated characters, so a repeated
    # character costs 2 time-steps instead of 1; truncate the label
    # once its encoded cost would exceed maxTextLen
    cost = 0
    for i in range(len(text)):
        if i != 0 and text[i] == text[i-1]:
            cost += 2
        else:
            cost += 1
        if cost > maxTextLen:
            return text[:i]  # truncated label that still fits
    return text

path = 'iam_dataset_words/'
maxTextLen = 32
samples = []
bad_samples = []
fileName = ''
dataAugmentation = False
chars = set()
f=open(path+ 'words.txt', "r")
cou = 0
bad_samples_reference = ['a01-117-05-02.png',
                                 'r06-022-03-05.png']
for line in f:
    cou+=1
    # ignore comment line
    if not line or line[0]=='#':
        continue

    lineSplit = line.strip().split(' ')
    assert len(lineSplit) >= 9

    fileNameSplit = lineSplit[0].split('-') #a01-000u-00-00 splits
    #../data/words/a01/a01-000u/a01-000u-00-00.png
    fileName = path + 'words/' \
                       + fileNameSplit[0] + '/' \
                       + fileNameSplit[0] + '-' \
                       + fileNameSplit[1] \
                       + '/' + lineSplit[0] + '.png'

    # GT text are columns starting at 9
    gtText = truncateLabel(' '.join(lineSplit[8:]), maxTextLen) #A,32
    #chars = chars.union(gtText) #unique chars only
    chars = chars.union(set(list(gtText)))

    # check if image is not empty
    if not os.path.getsize(fileName):
        bad_samples.append(lineSplit[0] + '.png')
        continue

    # put sample into list
    #'A','../data/words/a01/a01-000u/a01-000u-00-00.png'
    samples.append([gtText, fileName])

print(cou) 
print(len(samples))
print(samples[:2])

if set(bad_samples) != set(bad_samples_reference):
    print("Warning, damaged images found:", bad_samples)
    print("Damaged images expected:", bad_samples_reference)

train_samples = []
trainSamples = []
validationSamples = []
testSamples = []
valid_testSamples = []
# split into training and validation/test sets: 75% - 25%
splitIdx = int(0.75 * len(samples))
train_samples = samples[:splitIdx]
valid_testSamples = samples[splitIdx:]
print('vv:', len(valid_testSamples))
validationSamples = valid_testSamples[:15000]
testSamples = valid_testSamples[15000:]
print('valid: ',len(validationSamples))
print('test: ',len(testSamples))
print('train_before: ',len(train_samples))

# start with the training set
dataAugmentation = True
random.shuffle(train_samples)  # shuffle the full training split before taking a subset
trainSamples = train_samples[:25000]  # train data: 25000 samples
print('train_ after: ',len(trainSamples))
# # list of all unique chars in dataset
charList = sorted(list(chars))
char_list = str().join(charList)
# print('test samples: ',testSamples)
print('char list : ',char_list)

# # save characters of model for inference mode
# open(FilePaths.fnCharList, 'w').write(str().join(charList))
# # save words contained in dataset into file
# open(FilePaths.fnCorpus, 'w').write(str(' ').join(loader.trainWords + validationWords))

def encode_to_labels(txt):
    # encode each output word into a list of character indices
    chars = []
    for index, char in enumerate(txt):
        try:
            chars.append(char_list.index(char))
        except ValueError:
            print('character not in char_list:', char)

    return chars

print(trainSamples[:2])

# lists for training dataset
train_img = []
train_txt = []
train_input_length = []
train_label_length = []
train_orig_txt = []
max_label_len = 0
b = 0

for words, imgPath in trainSamples:
  img = preprocess(cv2.imread(imgPath, cv2.IMREAD_GRAYSCALE), imgSize, dataAugmentation = True)

  # compute maximum length of the text
  if len(words) > max_label_len:
    max_label_len = len(words)

  train_orig_txt.append(words)   
  train_label_length.append(len(words))
  train_input_length.append(31)  # CTC time-steps: the 128-wide input becomes 31 steps after pooling and the final valid conv
  train_img.append(img)
  train_txt.append(encode_to_labels(words)) 
  b+=1

# print(train_img[1])

print(len(train_txt))
train_txt[:5]

a = 0
#lists for validation dataset
valid_img = []
valid_txt = []
valid_input_length = []
valid_label_length = []
valid_orig_txt = []

for words, imgPath in validationSamples:
  img = preprocess(cv2.imread(imgPath, cv2.IMREAD_GRAYSCALE), imgSize, dataAugmentation = False)

  valid_orig_txt.append(words)   
  valid_label_length.append(len(words))
  valid_input_length.append(31)
  valid_img.append(img)
  valid_txt.append(encode_to_labels(words))
  a+=1

print(len(valid_txt))
valid_txt[:5]

# lists for test dataset
test_img = []
test_txt = []
test_input_length = []
test_label_length = []
test_orig_txt = []
c = 0

for words, imgPath in testSamples:
  img = preprocess(cv2.imread(imgPath, cv2.IMREAD_GRAYSCALE), imgSize, dataAugmentation = False)

  test_orig_txt.append(words)
  test_label_length.append(len(words))
  test_input_length.append(31)
  test_img.append(img)
  test_txt.append(encode_to_labels(words)) 
  c+=1
  # print(c)

print(test_img[0].shape)
print('Train: {}\nValid: {}\nTest: {}'.format(b,a,c))

print(max_label_len)
# pad each output label to maximum text length
train_padded_txt = pad_sequences(train_txt, maxlen=max_label_len, padding='post', value = len(char_list))
valid_padded_txt = pad_sequences(valid_txt, maxlen=max_label_len, padding='post', value = len(char_list))
test_padded_txt = pad_sequences(test_txt, maxlen=max_label_len, padding='post', value = len(char_list))

print(len(train_padded_txt))
print(len(test_padded_txt))
print(valid_padded_txt[1]) 
inputs = Input(shape=(128,32,1))
print(inputs.shape)
conv_1 = Conv2D(64, (3,3), activation = 'relu', padding='same')(inputs)
pool_1 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_1)

conv_2 = Conv2D(128, (3,3), activation = 'relu', padding='same')(pool_1)
pool_2 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_2)

conv_3 = Conv2D(256, (3,3), activation = 'relu', padding='same')(pool_2)

conv_4 = Conv2D(256, (3,3), activation = 'relu', padding='same')(conv_3)
pool_4 = MaxPool2D(pool_size=(1,2))(conv_4)

conv_5 = Conv2D(512, (3,3), activation = 'relu', padding='same')(pool_4)
batch_norm_5 = BatchNormalization()(conv_5)

conv_6 = Conv2D(512, (3,3), activation = 'relu', padding='same')(batch_norm_5)
batch_norm_6 = BatchNormalization()(conv_6)
pool_6 = MaxPool2D(pool_size=(1,2))(batch_norm_6)
conv_7 = Conv2D(512, (2,2), activation = 'relu')(pool_6)
# map-to-sequence-- dropping 1 dimension
squeezed = Lambda(lambda x: K.squeeze(x, 2))(conv_7)
blstm_1 = Bidirectional(LSTM(256, return_sequences=True, dropout = 0.2))(squeezed)
blstm_2 = Bidirectional(LSTM(256, return_sequences=True, dropout = 0.5))(blstm_1)
outputs = Dense(len(char_list)+1, activation = 'softmax')(blstm_2)  # +1 for the CTC blank token

# model to be used at test time
word_model = Model(inputs, outputs)

word_model.summary()

labels = Input(name='the_labels', shape=[max_label_len], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')


def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args

    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)


loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([outputs, labels, input_length, label_length])

#model to be used at training time
model = Model(inputs=[inputs, labels, input_length, label_length], outputs=loss_out)



# the real CTC loss is computed inside the 'ctc' Lambda layer, so the compile-time
# loss simply passes y_pred through (the dummy y_true is ignored)
model.compile(loss= {'ctc': lambda y_true, y_pred: y_pred}, optimizer = 'adam', metrics = ['accuracy'])

filepath="best_model.hdf5"
checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
callbacks_list = [checkpoint]

train_img = np.array(train_img)
train_input_length = np.array(train_input_length)
train_label_length = np.array(train_label_length)

valid_img = np.array(valid_img)
valid_input_length = np.array(valid_input_length)
valid_label_length = np.array(valid_label_length)

test_img = np.array(test_img)
test_input_length = np.array(test_input_length)
test_label_length = np.array(test_label_length)

test_img.shape

batch_size = 50
epochs = 30
train_history = model.fit(x=[train_img, train_padded_txt, train_input_length, train_label_length], 
          y=np.zeros(len(train_img)), batch_size=batch_size, epochs = epochs, 
          validation_data = ([valid_img, valid_padded_txt, valid_input_length, 
                              valid_label_length], [np.zeros(len(valid_img))]), 
          verbose = 1, callbacks = callbacks_list)
  

Train on 25000 samples, validate on 15000 samples

Epoch 1/30
25000/25000 [==============================] - 162s 6ms/step - loss: 14.5529 - acc: 0.0108 - val_loss: 14.2458 - val_acc: 0.0556

Epoch 00001: val_loss improved from inf to 14.24584, saving model to best_model.hdf5
Epoch 2/30
25000/25000 [==============================] - 148s 6ms/step - loss: 11.4702 - acc: 0.0578 - val_loss: 9.5779 - val_acc: 0.1113

Epoch 00002: val_loss improved from 14.24584 to 9.57788, saving model to best_model.hdf5
Epoch 3/30
25000/25000 [==============================] - 147s 6ms/step - loss: 8.9270 - acc: 0.1106 - val_loss: 8.4131 - val_acc: 0.1333

Epoch 00003: val_loss improved from 9.57788 to 8.41311, saving model to best_model.hdf5
Epoch 4/30
25000/25000 [==============================] - 147s 6ms/step - loss: 6.4378 - acc: 0.1855 - val_loss: 6.1883 - val_acc: 0.2052

Epoch 00004: val_loss improved from 8.41311 to 6.18827, saving model to best_model.hdf5
Epoch 5/30
25000/25000 [==============================] - 147s 6ms/step - loss: 5.0134 - acc: 0.2529 - val_loss: 4.8411 - val_acc: 0.2675

Epoch 00005: val_loss improved from 6.18827 to 4.84106, saving model to best_model.hdf5
Epoch 6/30
25000/25000 [==============================] - 146s 6ms/step - loss: 4.1895 - acc: 0.3060 - val_loss: 4.9187 - val_acc: 0.2894

Epoch 00006: val_loss did not improve from 4.84106
Epoch 7/30
25000/25000 [==============================] - 145s 6ms/step - loss: 3.6483 - acc: 0.3517 - val_loss: 4.7853 - val_acc: 0.2939

Epoch 00007: val_loss improved from 4.84106 to 4.78526, saving model to best_model.hdf5
Epoch 8/30
25000/25000 [==============================] - 144s 6ms/step - loss: 3.2317 - acc: 0.3910 - val_loss: 4.9047 - val_acc: 0.3210

Epoch 00008: val_loss did not improve from 4.78526
Epoch 9/30
25000/25000 [==============================] - 144s 6ms/step - loss: 2.8555 - acc: 0.4282 - val_loss: 4.9005 - val_acc: 0.3301

Epoch 00009: val_loss did not improve from 4.78526
Epoch 10/30
25000/25000 [==============================] - 144s 6ms/step - loss: 2.6267 - acc: 0.4542 - val_loss: 5.0818 - val_acc: 0.3391

Epoch 00010: val_loss did not improve from 4.78526
Epoch 11/30
25000/25000 [==============================] - 145s 6ms/step - loss: 2.3906 - acc: 0.4847 - val_loss: 4.7551 - val_acc: 0.3576

Epoch 00011: val_loss improved from 4.78526 to 4.75513, saving model to best_model.hdf5
Epoch 12/30
25000/25000 [==============================] - 145s 6ms/step - loss: 2.1440 - acc: 0.5180 - val_loss: 4.5446 - val_acc: 0.3958

Epoch 00012: val_loss improved from 4.75513 to 4.54463, saving model to best_model.hdf5
Epoch 13/30
25000/25000 [==============================] - 146s 6ms/step - loss: 1.9453 - acc: 0.5446 - val_loss: 5.2534 - val_acc: 0.3459

Epoch 00013: val_loss did not improve from 4.54463
Epoch 14/30
25000/25000 [==============================] - 146s 6ms/step - loss: 1.7780 - acc: 0.5714 - val_loss: 4.7764 - val_acc: 0.3819

Epoch 00014: val_loss did not improve from 4.54463
Epoch 15/30
25000/25000 [==============================] - 145s 6ms/step - loss: 1.6636 - acc: 0.5872 - val_loss: 5.2399 - val_acc: 0.3673

Epoch 00015: val_loss did not improve from 4.54463
Epoch 16/30
25000/25000 [==============================] - 144s 6ms/step - loss: 1.5140 - acc: 0.6113 - val_loss: 5.1928 - val_acc: 0.3815

Epoch 00016: val_loss did not improve from 4.54463
Epoch 17/30
25000/25000 [==============================] - 144s 6ms/step - loss: 1.4235 - acc: 0.6244 - val_loss: 5.3105 - val_acc: 0.3917

Epoch 00017: val_loss did not improve from 4.54463
Epoch 18/30
25000/25000 [==============================] - 144s 6ms/step - loss: 1.3314 - acc: 0.6441 - val_loss: 5.3560 - val_acc: 0.4027

Epoch 00018: val_loss did not improve from 4.54463
Epoch 19/30
25000/25000 [==============================] - 144s 6ms/step - loss: 1.2458 - acc: 0.6617 - val_loss: 5.0711 - val_acc: 0.4133

Epoch 00019: val_loss did not improve from 4.54463
Epoch 20/30
25000/25000 [==============================] - 144s 6ms/step - loss: 1.1967 - acc: 0.6646 - val_loss: 5.3788 - val_acc: 0.4114

Epoch 00020: val_loss did not improve from 4.54463
Epoch 21/30
25000/25000 [==============================] - 144s 6ms/step - loss: 1.1216 - acc: 0.6842 - val_loss: 6.0138 - val_acc: 0.3851

Epoch 00021: val_loss did not improve from 4.54463
Epoch 22/30
25000/25000 [==============================] - 144s 6ms/step - loss: 1.0586 - acc: 0.6968 - val_loss: 5.3665 - val_acc: 0.4191

Epoch 00022: val_loss did not improve from 4.54463
Epoch 23/30
25000/25000 [==============================] - 144s 6ms/step - loss: 1.0066 - acc: 0.7044 - val_loss: 5.2639 - val_acc: 0.4236

Epoch 00023: val_loss did not improve from 4.54463
Epoch 24/30
25000/25000 [==============================] - 144s 6ms/step - loss: 0.9834 - acc: 0.7104 - val_loss: 5.4980 - val_acc: 0.4189

Epoch 00024: val_loss did not improve from 4.54463
Epoch 25/30
25000/25000 [==============================] - 144s 6ms/step - loss: 0.9717 - acc: 0.7138 - val_loss: 5.0572 - val_acc: 0.4434

Epoch 00025: val_loss did not improve from 4.54463
Epoch 26/30
25000/25000 [==============================] - 144s 6ms/step - loss: 0.9274 - acc: 0.7213 - val_loss: 6.0407 - val_acc: 0.3975

Epoch 00026: val_loss did not improve from 4.54463
Epoch 27/30
25000/25000 [==============================] - 143s 6ms/step - loss: 0.9017 - acc: 0.7280 - val_loss: 6.4211 - val_acc: 0.3784

Epoch 00027: val_loss did not improve from 4.54463
Epoch 28/30
25000/25000 [==============================] - 143s 6ms/step - loss: 0.8910 - acc: 0.7319 - val_loss: 5.5537 - val_acc: 0.4211

Epoch 00028: val_loss did not improve from 4.54463
Epoch 29/30
25000/25000 [==============================] - 144s 6ms/step - loss: 0.8670 - acc: 0.7372 - val_loss: 5.9796 - val_acc: 0.4114

Epoch 00029: val_loss did not improve from 4.54463
Epoch 30/30
25000/25000 [==============================] - 145s 6ms/step - loss: 0.8316 - acc: 0.7467 - val_loss: 5.5567 - val_acc: 0.4304

Epoch 00030: val_loss did not improve from 4.54463

2 Answers:

Answer 0: (score: 0)

Not sure what you have already tried, but have you checked whether your training and validation samples are balanced? That is, whether each category is represented by roughly the same percentage of examples in both sets.
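As a quick sanity check, you could compare the two splits, for example by word length. A minimal sketch using the variable names already defined in the question (word length is only a rough proxy for balance; character frequencies could be compared the same way):

from collections import Counter

# fraction of samples at each label length, per split
train_lengths = Counter(len(w) for w, _ in trainSamples)
valid_lengths = Counter(len(w) for w, _ in validationSamples)
for length in sorted(set(train_lengths) | set(valid_lengths)):
    print(length,
          round(train_lengths[length] / len(trainSamples), 3),
          round(valid_lengths[length] / len(validationSamples), 3))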

You could shuffle samples with random.shuffle(samples) before executing the following code:

splitIdx = int(0.75 * len(samples))
train_samples = samples[:splitIdx]

That way you can be more confident that the training and validation sets are balanced.
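Put together, the change is just one extra line before the split (a minimal sketch; the fixed seed is my own addition, only there to make the split reproducible):

random.seed(42)          # optional, for a reproducible split
random.shuffle(samples)  # mix writers/forms before splitting
splitIdx = int(0.75 * len(samples))
train_samples = samples[:splitIdx]
valid_testSamples = samples[splitIdx:]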

Answer 1: (score: 0)

There are quite a few things you can do.

  • Add batch normalization after every Conv2D layer
  • Replace max pooling with a strided Conv2D with valid padding, so the downsampling becomes a learnable layer

From: pool_1 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_1)

To: pool_1 = Conv2D(filters, kernel_size=(1, 1), strides=2, padding='valid')(conv_1)
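Applied to the first block of the question's model, those two suggestions might look like this (a sketch, not a drop-in replacement; 64 is simply the filter count conv_1 already uses):

conv_1 = Conv2D(64, (3,3), activation='relu', padding='same')(inputs)
conv_1 = BatchNormalization()(conv_1)  # batch norm after the conv layer
# learnable downsampling in place of MaxPool2D
pool_1 = Conv2D(64, kernel_size=(1, 1), strides=2, padding='valid')(conv_1)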

  • Add l2 regularization to your layers, take a look here for an implementation (see the sketch after this list)
  • Try weight decay
  • Increase the dropout values you already have
  • Adjust your learning rate; a value that is too small can get stuck in a local minimum

There are many more options; the only way to know what works is to try them.
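For instance, the l2 regularization, dropout, and learning-rate points could be wired into the question's model roughly as follows (a sketch with illustrative, untuned values; note that in stock Keras, weight decay is usually approximated with an L2 kernel regularizer, so the first layer below covers both of those bullets):

from keras import regularizers
from keras.optimizers import Adam

# L2-regularized conv layer (weight-decay-like penalty on the kernel)
conv_1 = Conv2D(64, (3,3), activation='relu', padding='same',
                kernel_regularizer=regularizers.l2(1e-4))(inputs)

# increased dropout in the recurrent layers
blstm_1 = Bidirectional(LSTM(256, return_sequences=True, dropout=0.4))(squeezed)
blstm_2 = Bidirectional(LSTM(256, return_sequences=True, dropout=0.6))(blstm_1)

# expose the learning rate explicitly so it can be tuned up or down
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
              optimizer=Adam(lr=0.001))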