Question

我正在尝试基于keras ocr示例link来实现手写ocr。但是我收到以下错误：

InvalidArgumentError: All labels must be nonnegative integers, batch: 0 labels: 1,0,11,9,45,0,25,17,27,41,39,9,37,0,23,1,39,9,35,0,11,35,29,25,0,1,0,27,9,1,35,3,49,0,43,17,23,23,1,13,9,0,69,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
     [[{{node ctc_6/CTCLoss}}]]
     [[{{node training_5/SGD/gradients/ctc_6/CTCLoss_grad/mul}}]]

以下是生成器，ctc和火车函数：

def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    # the 2 is critical here since the first couple outputs of the RNN
    # tend to be garbage:
    y_pred = y_pred[:, 2:, :]
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

#Generation of data: load the images, resize, gray, normalize them 
class DataGenerator(keras.utils.Sequence):
    def __init__(self, list_Files, labels,downsample_factor, max_string_length=80, batch_size=32, dim=(512,64), shuffle=True):
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_Files = list_Files
        self.shuffle = shuffle
        self.on_epoch_end()
        self.max_string_length = max_string_length
        self.downsample_factor = downsample_factor       

    #TODO: Add weight save
    def on_epoch_end(self):
        self.indexes = np.arange(len(self.list_Files))
        if self.shuffle==True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_Files_temp):
        #*[2,2] --> 2,2 (unpack values)
        X = np.ones([self.batch_size, *self.dim,1]) 
        y = np.ones([self.batch_size, self.max_string_length])*-1 #As in the keras_ocr example why -1?
        X_length = np.zeros([self.batch_size,1])
        y_length = np.zeros([self.batch_size,1])

        #TODO: add mix with blank inputs as it is said to be important for transitional invariance


        for i, file in enumerate(list_Files_temp):
            im = cv2.imread(file)# load the file as numpy array
            im = cv2.cvtColor(im, cv2.COLOR_RGB2GRAY) #Transform the file into a Gray image
            im = cv2.resize(im, self.dim[::-1]) #Resize it (cv2 takes width first)
            im = im / 255 #Normalization

            X[i,0:self.dim[0],:,0] = im
            X_length[i] = self.dim[0] // self.downsample_factor -2 #?????

            seq = text_to_labels(self.labels[file])            
            y[i,0:len(seq)] = text_to_labels(self.labels[file]) #Transform the text into a list of integers

            y_length[i] = len(y[i])
            print("LEN={0}".format(y_length[i]))


        inputs={'the_input': X,
                'the_labels': y,
                'input_length':X_length,
                'label_length':y_length
                }
        outputs = {'ctc': np.zeros([self.batch_size])}


        print(y)


        return (inputs, outputs)

    def __len__(self):
        'Number of batches per epoch'
        return int(np.floor(len(self.list_Files) / self.batch_size))

    def __getitem__(self, index):
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        list_Files_temp = [self.list_Files[k] for k in indexes]
        #print(list_Files_temp[0])
        (inputs, outputs) = self.__data_generation(list_Files_temp)

        return (inputs, outputs)

def train(dim_images,partition,labels):

    #Misc parameters
    absolute_max_string_length = 80 
    output_size = len(alphabet) + 1 #+1 for the CTC blank symbol

    #Network parameters
    img_h = dim_images[0]
    img_w = dim_images[1]
    conv_filters = 16
    kernel_size = (3,3)
    pool_size = 2
    time_dense_size = 32
    rnn_size = 512
    act = 'relu'
    input_shape = (*DIM_IMAGES,1)
    downsample_factor = pool_size**2

    #Convolutional layer
    input_data = Input(name='the_input', shape=input_shape)
    inner = Conv2D(conv_filters, kernel_size, padding='same', 
                   activation=act, kernel_initializer='he_normal', name='conv1')(input_data)
    inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max1')(inner)
    inner = Conv2D(conv_filters, kernel_size, padding='same',
               activation=act, kernel_initializer='he_normal',
               name='conv2')(inner)
    inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max2')(inner)


    conv_to_rnn_dims = (img_w // (pool_size ** 2), (img_h // (pool_size ** 2)) * conv_filters)
    inner = Reshape(target_shape=conv_to_rnn_dims, name='reshape')(inner)

    #Recurrent layer
    gru_1 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru1')(inner)
    gru_1b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru1_b')(inner)
    gru1_merged = add([gru_1, gru_1b])
    gru_2 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru2')(gru1_merged)
    gru_2b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru2_b')(gru1_merged)

    # transforms RNN output to character activations:
    inner = Dense(output_size, kernel_initializer='he_normal',
              name='dense2')(concatenate([gru_2, gru_2b]))

    #Prediction (need to be decoded)
    y_pred = Activation('softmax', name='softmax')(inner)

    Model(inputs=input_data, outputs=y_pred).summary()

    labelsI = Input(name='the_labels',
                   shape =[absolute_max_string_length], dtype='float32')
    input_length = Input(name='input_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')

    # Keras doesn't currently support loss funcs with extra parameters
    # so CTC loss is implemented in a lambda layer
    loss_out = Lambda(
        ctc_lambda_func, output_shape=(1,),
        name='ctc')([y_pred, labelsI, input_length, label_length])


    #Genrators 
    training_generator = DataGenerator(partition['train'],labels,downsample_factor, batch_size=BATCH_SIZE, dim=DIM_IMAGES, shuffle=True)
    valid_generator = DataGenerator(partition['valid'], labels,downsample_factor, batch_size=BATCH_SIZE, dim=DIM_IMAGES, shuffle=False)

    # clipnorm seems to speeds up convergence
    sgd = SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)

    model = Model(inputs=[input_data, labelsI, input_length, label_length],
                  outputs=loss_out)

    # the loss calc occurs elsewhere, so use a dummy lambda func for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=sgd)

    # captures output of softmax so we can decode the output during visualization
    test_func = K.function([input_data], [y_pred])


    model.fit_generator(
        generator=training_generator,
        steps_per_epoch=(len(partition['train'])-len(partition['valid'])) // BATCH_SIZE,
        epochs=20,
        validation_data=valid_generator,
        validation_steps=len(partition['valid'])//BATCH_SIZE)

我猜'-1'标签来自此行：

y = np.ones([self.batch_size, self.max_string_length])*-1

在原始代码中，有类似的行（第220行），但运行良好：

self.Y_data = np.ones([self.num_words, self.absolute_max_string_len]) * -1

我以为'-1'是填充序列的一种方式，但是ctc函数似乎禁止使用此值，我在这里缺少什么吗？

Answer 1

似乎我只是在混合图像长度和图像宽度。另外，“ label_length”应等于句子的实际长度（在填充-1之前）。因此这行：

y_length[i] = len(y[i])

应替换为：

y_length[i] = len(seq)

带负号的负标签OCR示例

1 个答案: