RNN和CNN-RNN无法正确训练,总是预测一堂课

时间:2020-03-24 17:07:27

标签: python keras nlp conv-neural-network recurrent-neural-network

我目前正在开发一个模型,以使用深度学习算法从文本中检测情感。我有一个相对较小的标记数据集(〜7500),其中包含7种不同的情感作为类。我开发了CNN,并达到了约63%的准确度,但是当我尝试使用LSTM应用RNN和也使用LSTM应用CNN-RNN时,它们似乎根本无法正确训练,并且总是最终会预测同一个班级。我相信我的模型从根本上讲是合理的,但参数存在一些错误。我将数据集分为85%用于训练,另外20%用于验证,其余15%用于测试。我的嵌入矩阵是使用Google新闻word2vec中的单词表示法开发的,而单词索引是使用keras Tokenizer开发的。

数据集明细:

情感

愤怒1086

令人反感的1074

恐惧1086

罪恶1062

欢乐1089

悲伤1080

羞耻1058

CNN实施

def make_model(kernel_sizes, num_filters, dropout, hidden_units):

    submodels = []
    for kernel_size in kernel_sizes:
        submodel = Sequential()

        submodel.add(Embedding(input_dim = input_dim,
                            output_dim   = output_dim,
                            weights      = [embedding_matrix],
                            input_length = max_len,
                            trainable    = True))

        submodel.add(Conv1D(filters=num_filters, kernel_size=kernel_size, padding='same',activation='relu',strides=1))
        submodel.add(GlobalMaxPooling1D())
        submodels.append(submodel)

    submodel_outputs = [model.output for model in submodels]    
    submodel_inputs = [model.input for model in submodels]

    merged = Concatenate(axis=1)(submodel_outputs)
    x = Dropout(dropout)(merged)

    if(hidden_units > 0):
        x = Dense(hidden_units, activation='relu')(x)
        x = Dropout(dropout)(x)

    x = Dense(7,activation='softmax', kernel_initializer="uniform")(x)
    out = Activation('sigmoid')(x)

    model = Model(submodel_inputs, out)
    model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])

    return model
def fit_model(model, kernel_sizes, num_epochs, batch_size, x_train, y_train):

    x_train = [x_train]*len(kernel_sizes)

    history = model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, validation_split=0.2)

    return history
kernel_sizes  = [2,6]
num_filters   = 100
dropout       = 0.6
num_hidden    = 270
callbacks     = callbacks_list
num_epochs    = 15
batch_size = 64
model = make_model(kernel_sizes, num_filters, dropout, num_hidden)
print(model.summary())
history = fit_model(model, kernel_sizes, num_epochs, batch_size, x_train, y_train)

型号:“ model_1”


图层(类型)输出形状参数#已连接

embedding_1_input(InputLayer)(无,179)0


embedding_2_input(InputLayer)(无,179)0


embedding_1(嵌入)(无,179,300)2729400 embedding_1_input [0] [0]


embedding_2(嵌入)(无,179,300)2729400 embedding_2_input [0] [0]


conv1d_1(Conv1D)(无,179,100)60100 embedding_1 [0] [0]


conv1d_2(Conv1D)(无,179,100)180100 embedding_2 [0] [0]


global_max_pooling1d_1(GlobalM(None,100)0 conv1d_1 [0] [0]


global_max_pooling1d_2(GlobalM(None,100)0 conv1d_2 [0] [0]


concatenate_1(串联)(无,200)0 global_max_pooling1d_1 [0] [0]
global_max_pooling1d_2 [0] [0]


dropout_1(退出)(无,200)0 concatenate_1 [0] [0]


dense_1(密集)(无,270)54270 dropout_1 [0] [0]


dropout_2(Dropout)(None,270)0 density_1 [0] [0]


dense_2(Dense)(None,7)1897 dropout_2 [0] [0]


activation_1(激活)(无,7)0 density_2 [0] [0]

总参数:5,755,167 可训练的参数:5,755,167 不可训练的参数:0


Training and Validation results for CNN

CNN confusion matrix


RNN实施

def make_model(lstm_units, dropout, hidden_units):

    model = Sequential()   

    model.add(Embedding(input_dim = input_dim,
                        output_dim   = output_dim,
                        weights      = [embedding_matrix],
                        input_length = max_len,
                        trainable    = False))

    model.add(LSTM(lstm_units))

    model.add(Dropout(dropout))

    if(hidden_units > 0):
        model.add(Dense(hidden_units, activation='elu'))
        model.add(Dropout(dropout))

    model.add(Dense(7,activation='softmax', kernel_initializer="uniform"))
    model.add(Activation('sigmoid'))

    model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])

    return model
lstm_units = 120
dropout = 0.5
hidden_units = 550
callbacks = [tensorboard, early]
num_epochs = 20
batch_size = 60

model = make_model(lstm_units, dropout, hidden_units)
print(model.summary())
history = fit_model(model, num_epochs, batch_size, x_train, y_train)

型号:“ sequential_6”


图层(类型)输出形状参数#

embedding_6(嵌入)(无人,179,300)2729400


lstm_8(LSTM)(无,120)202080


dropout_5(退出)(无,120)0


dense_6(密集)(无,550)66550


dropout_6(退出)(无,550)0


dense_7(密集)(无,7)3857


activation_3(激活)(无,7)0

总参数:3001887 可调参数:272,487 不可训练参数:2,729,400


RNN training and validation scores

RNN confusion matrix


CNN-RNN的实现

def make_model(kernel_sizes, num_filters, dropout, hidden_units, lstm_units):

    submodels = []
    for kernel_size in kernel_sizes:
        submodel = Sequential()

        submodel.add(Embedding(input_dim = input_dim,
                            output_dim   = output_dim,
                            weights      = [embedding_matrix],
                            input_length = max_len,
                            trainable    = True))

        submodel.add(Conv1D(filters=num_filters, kernel_size=kernel_size, padding='same',activation='relu',strides=1))
        submodel.add(MaxPooling1D(pool_size=2, strides = 2))
        submodel.add(Dropout(dropout))
        submodel.add(LSTM(lstm_units)) 
        submodels.append(submodel)

    submodel_outputs = [model.output for model in submodels]    
    submodel_inputs = [model.input for model in submodels]

    merged = Concatenate(axis=1)(submodel_outputs)
    x = Dropout(dropout)(merged)

    if(hidden_units > 0):
        x = Dense(hidden_units, activation='relu')(x)
        x = Dropout(dropout)(x)

    x = Dense(7,activation='softmax', kernel_initializer="uniform")(x)
    out = Activation('sigmoid')(x)

    model = Model(submodel_inputs, out)
    model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])

    return model
kernel_sizes  = [2,3,6]
num_filters   = 100
dropout       = 0.6
num_hidden    = 270
lstm_units = 80
callbacks     = [tensorboard, early]
num_epochs    = 20
batch_size = 64

model = make_model(kernel_sizes, num_filters, dropout, num_hidden, lstm_units)
print(model.summary())
history = fit_model(model, kernel_sizes, num_epochs, batch_size, x_train, y_train)

型号:“ model_2”


图层(类型)输出形状参数#已连接

embedding_8_input(InputLayer)(无,179)0


embedding_9_input(InputLayer)(无,179)0


embedding_10_input(InputLayer)(无,179)0


embedding_8(嵌入)(无,179,300)2729400 embedding_8_input [0] [0]


embedding_9(嵌入)(无,179,300)2729400 embedding_9_input [0] [0]


embedding_10(嵌入)(无,179,300)2729400 embedding_10_input [0] [0]


conv1d_8(Conv1D)(无,179,100)60100 embedding_8 [0] [0]


conv1d_9(Conv1D)(无,179,100)90100 embedding_9 [0] [0]


conv1d_10(Conv1D)(无,179,100)180100 embedding_10 [0] [0]


max_pooling1d_7(MaxPooling1D)(无,89,100)0 conv1d_8 [0] [0]


max_pooling1d_8(MaxPooling1D)(无,89,100)0 conv1d_9 [0] [0]


max_pooling1d_9(MaxPooling1D)(无,89,100)0 conv1d_10 [0] [0]


dropout_9(退出)(无,89、100)0 max_pooling1d_7 [0] [0]


dropout_10(Dropout)(无,89,100)0 max_pooling1d_8 [0] [0]


dropout_11(Dropout)(无,89,100)0 max_pooling1d_9 [0] [0]


lstm_2(LSTM)(无,80)57920 dropout_9 [0] [0]


lstm_3(LSTM)(无,80)57920 dropout_10 [0] [0]


lstm_4(LSTM)(无,80)57920 dropout_11 [0] [0]


concatenate_3(串联)(无,240)0 lstm_2 [0] [0]
lstm_3 [0] [0]
lstm_4 [0] [0]


dropout_12(Dropout)(无,240)0 concatenate_3 [0] [0]


dense_3(密集)(无,270)65070 dropout_12 [0] [0]


dropout_13(Dropout)(无,270)0 density_3 [0] [0]


dense_4(密集)(无,7)1897 dropout_13 [0] [0]


activation_2(激活)(无,7)0 density_4 [0] [0]

总参数:8,759,227 可训练的参数:8,759,227 不可训练的参数:0


CNN-RNN training and validation scores CNN-RNN confusion matrix

我了解神经网络没有神奇的公式,没有一种适合所有方法的方法,我只是在寻找在实施CNN-RNN和RNN时可能出错的领域的一些指导。

对于任何格式化错误,我们深表歉意,因为这是我的第一个问题。如果需要其他信息,请告诉我。

非常感谢。

2 个答案:

答案 0 :(得分:0)

我不能说这可以解决您的所有问题,但是绝对错误的是您在softmax激活后立即重复使用S形激活,而您的分类问题有7个类。乙状结肠激活只能分开两个类别。

例如:

model.add(Dense(7,activation='softmax', kernel_initializer="uniform"))
model.add(Activation('sigmoid'))

您应该执行三次删除S型激活。

答案 1 :(得分:0)

首先,您的CNN实施过于热情,您是否通过尝试多种设计来提出该体系结构,或者只是选择了它?

通常,当被选择的多个磁头它们供给的输入,而不是完全相同的副本,以便也许你的多头设计是不最优化的选择,它引入了过多不必要的参数,并可能导致过度拟合的稍微变化并从您的损失曲线中可以看出。

https://i.stack.imgur.com/v1GeS.png

您使用了分类交叉熵,但在softmax之后使用了S形,这也不是完成工作的方式。只需使用softmax激活即可摆脱乙状结肠。

测试集的混淆矩阵是吗?然后,看来您的测试拆分太容易了,因为模型过于拟合,因此应该表现不佳。因此,通过确保培训和测试中没有太多相似数据来尝试找到更好的测试划分。

在使用复杂模型之前,最好先对简单模型进行微调。由于您的LSTM模型表现不佳,因此尝试使用更复杂的模型(CNN-LSTM)毫无意义。您的LSTM模型没有收敛,原因可能很多(显而易见的原因是激活层的使用不正确)。

def make_model(lstm_units, dropout, hidden_units):

    model = Sequential()   

    model.add(Embedding(input_dim = input_dim,
                        output_dim   = output_dim,
                        weights      = [embedding_matrix],
                        input_length = max_len,
                        trainable    = False))

    model.add(LSTM(lstm_units, return_sequences = True, recurrent_dropout = 0.2))
    model.add(Dropout(dropout))
    model.add(LSTM(lstm_units, recurrent_dropout = 0.2))

    model.add(Dropout(dropout))


    model.add(Dense(7, activation='softmax'))

    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['acc'])

    return model

通过摆脱FC层,使其成为完全基于LSTM的模型,还从较小的LSTM单元开始,例如 8、16、32,... >

要获得更多改进,您可以执行以下操作。

0)摆脱手套的嵌入方式,并使用自己可学习的嵌入方式。

1)通过网络进行超参数搜索以找到最佳模型。

有很多库,但是我发现这很灵活。 https://github.com/keras-team/keras-tuner

只需安装pip。

这是演示代码。

from tensorflow import keras
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch


def build_model(hp):
    model = keras.Sequential()
    model.add(layers.Embedding(input_dim=hp.Int('input_dim',
                                        min_value=5000,
                                        max_value=10000,
                                        step = 1000),
                              output_dim=hp.Int('output_dim',
                                        min_value=200,
                                        max_value=800,
                                        step = 100),
                              input_length = 400))
    model.add(layers.Convolution1D(
                filters=hp.Int('filters',
                                        min_value=32,
                                        max_value=512,
                                        step = 32),
                kernel_size=hp.Int('kernel_size',
                                        min_value=3,
                                        max_value=11,
                                        step = 2),
                padding='same',
                activation='relu')),
    model.add(layers.BatchNormalization())
    model.add(layers.MaxPooling1D())
    model.add(layers.Flatten())
    model.add(layers.Dropout(0.4))
    model.add(layers.Dense(units=hp.Int('units',
                                        min_value=64,
                                        max_value=256,
                                        step=32),
                           activation='relu'))
    model.add(layers.Dropout(0.4))
    model.add(layers.Dense(7, activation='softmax'))
    model.compile(
    optimizer=keras.optimizers.Adam(
        hp.Choice('learning_rate',
                  values=[1e-2, 1e-3, 1e-4])),
    loss='categorical_crossentropy',
    metrics=['accuracy'])
    return model


tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=5,
    executions_per_trial=3,
    directory='my_dir',
    project_name='helloworld')
tuner.search_space_summary()

## The following lines are based on your model


tuner.search(x, y,
             epochs=5,
             validation_data=(val_x, val_y))

models = tuner.get_best_models(num_models=2)

如果您想提取更多有意义的特征,我发现一种很有前途的方法是提取预先训练的BERT特征,然后使用CNN / LSTM进行训练。

一个很好的存储库是这个- https://github.com/UKPLab/sentence-transformers

一旦您从BERT / XLNet中获得了嵌入句子的功能,就可以使用这些功能来训练与您正在使用的CNN类似的另一CNN,但因为它的价格昂贵,可以摆脱嵌入层。