I'm trying to implement Baidu's DeepSpeech 1 in Keras with the CTC loss. My code is below:
import numpy as np
from keras.utils import Sequence
from keras.activations import relu
from keras.backend import ctc_batch_cost
from keras.layers import (Input, Dense, Activation, Dropout, TimeDistributed,
                          Bidirectional, LSTM, Lambda)
from keras.models import Model

class dataGen(Sequence):  # data generator for Mozilla Common Voice
    def __init__(self, audiopaths, transcripts, batch_size):
        self.x = audiopaths
        self.y = transcripts
        self.batch_size = batch_size

    def __len__(self):
        return len(self.x) // self.batch_size

    def __getitem__(self, idx):
        batch_x = self.x[idx*self.batch_size : (idx+1)*self.batch_size]
        batch_y = self.y[idx*self.batch_size : (idx+1)*self.batch_size]
        x_val = [get_max_time(file_name) for file_name in batch_x]
        max_val = max(x_val)
        x_data = np.array([make_mfcc_shape(file_name, padlen=max_val) for file_name in batch_x])  # just converts the audio to MFCCs
        y_val = [get_maxseq_len(l) for l in batch_y]
        max_y = max(y_val)
        labels = np.array([get_intseq(l, max_intseq_length=max_y) for l in batch_y])
        input_length = np.array(x_val)    # frames per clip
        label_length = np.array(y_val)    # characters per transcript
        return [x_data, labels, input_length, label_length], np.zeros((self.batch_size,)), [None]

    def on_epoch_end(self):
        # shuffle between epochs (assumes self.x / self.y are numpy arrays so fancy indexing works)
        i = np.arange(len(self.x))
        np.random.shuffle(i)
        self.x = self.x[i]
        self.y = self.y[i]
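For reference, one batch from this generator looks roughly like this (a quick sanity-check sketch; the shapes assume make_mfcc_shape returns a (frames, 26) MFCC matrix, which is what the model input expects):

gen = dataGen(audiopaths, transcripts, batch_size=4)
[x_data, labels, input_length, label_length], dummy_y, _ = gen[0]
print(x_data.shape)    # (4, max_frames, 26)  padded MFCC features
print(labels.shape)    # (4, max_label_len)   integer-encoded transcripts
print(input_length)    # true frame count of each clip
print(label_length)    # true character count of each transcript
print(dummy_y.shape)   # (4,) zeros fed to the dummy CTC loss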
def clipped_relu(x):
    # ReLU clipped at 20, as in the DeepSpeech paper
    return relu(x, max_value=20)

def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    return ctc_batch_cost(labels, y_pred, input_length, label_length)

def ctc(y_true, y_pred):
    # dummy loss: the real CTC loss is already computed inside the Lambda layer,
    # so it is just passed straight through
    return y_pred
input_data = Input(name='the_input', shape=(None, 26))
inner = TimeDistributed(Dense(2048))(input_data)
inner = TimeDistributed(Activation(clipped_relu))(inner)
inner = TimeDistributed(Dropout(0.1))(inner)
inner = TimeDistributed(Dense(2048))(inner)
inner = TimeDistributed(Activation(clipped_relu))(inner)
inner = TimeDistributed(Dropout(0.1))(inner)
inner = TimeDistributed(Dense(2048))(inner)
inner = TimeDistributed(Activation(clipped_relu))(inner)
inner = TimeDistributed(Dropout(0.1))(inner)
inner = Bidirectional(LSTM(2048, return_sequences=True))(inner)
inner = TimeDistributed(Activation(clipped_relu))(inner)
inner = TimeDistributed(Dropout(0.1))(inner)
output = TimeDistributed(Dense(28, activation="softmax"))(inner)
labels = Input(name='the_labels', shape=[None,])
input_length = Input(name='input_length', shape=[1])
label_length = Input(name='label_length', shape=[1])
loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([output, labels, input_length, label_length])
model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)
model.compile(optimizer='adam', loss=ctc)
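For completeness, training is wired up roughly like this (a minimal sketch; the batch size and epoch count are placeholders):

train_gen = dataGen(audiopaths, transcripts, batch_size=16)
model.fit_generator(train_gen, epochs=50)  # steps_per_epoch defaults to len(train_gen)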
This is all fairly standard, but during training my loss typically falls to somewhere between 100 and 200 (starting from > 1000) and then stops improving, and when I test the model (removing the Lambda layer to get transcript output) it only ever outputs blank characters.
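For reference, the way I pull transcripts out for testing is roughly the following (a minimal sketch, assuming a prediction model built from the_input to the softmax output and greedy decoding with K.ctc_decode; x_data and x_val here are one batch from the generator above):

from keras import backend as K

pred_model = Model(inputs=input_data, outputs=output)  # same graph, minus the CTC Lambda
probs = pred_model.predict(x_data)                     # (batch, time, 28) softmax outputs
decoded, _ = K.ctc_decode(probs, input_length=np.array(x_val), greedy=True)
print(K.get_value(decoded[0]))  # comes back essentially empty / all padding, i.e. only blanks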
My theory is that it has learned to output nothing but blanks, because that gives a somewhat lower loss than random characters, and it then gets stuck in that local minimum without ever actually learning to transcribe the audio.
Does anyone know how to fix this?