I have a neural network model that works fine, but my data is very large, so I am trying to run the model with a generator. I have two categorical variables, which I encode first with MultiColumnLabelEncoder and then with OneHotEncoder. After MultiColumnLabelEncoder the categorical variables have shape (82914, 2), and after OneHotEncoder they have shape (82914, 7668). I have meta_input = Input(train_X2.shape[1],) and expected it to take the shape after the second encoding, but it gives me the following error: "ValueError: Error when checking input: expected meta_input to have shape (2,) but got array with shape (7668,)". The reason I use a LabelEncoder first and then a OneHotEncoder is that I cannot apply OneHotEncoder directly. How can I make the model use the shape after the second encoding?
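To illustrate where the two widths come from, here is a minimal check using the encoders fitted in the code below: the model input is sized from the raw two-column frame, while the generator yields the one-hot-encoded matrix.

encoded = onehot.transform(input_encoder.transform(train_X2))
print(train_X2.shape)  # (82914, 2)    -> the width meta_input is currently built with
print(encoded.shape)   # (82914, 7668) -> the width the generator actually yields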
# Imports assumed by the snippet (Keras 2.x API); MultiColumnLabelEncoder is a
# custom helper class, not shown here.
from keras.models import Model
from keras.layers import (Input, Embedding, Bidirectional, LSTM,
                          GlobalMaxPooling1D, Dropout, Dense, concatenate)
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def generator(df, vocab_size, batch_size, tokenizer, input_encoder, onehot):
    n_examples = len(df)
    number_of_batches = n_examples // batch_size
    counter = 0
    while True:
        if counter >= number_of_batches:
            # reshuffle the dataframe and start over
            df = df.sample(frac=1).reset_index(drop=True)
            counter = 0
        start_index = counter * batch_size
        end_index = start_index + batch_size
        counter += 1
        # text input: tokenize and pad to the fixed sequence length of 200
        X_out1 = tokenizer.texts_to_sequences(df.iloc[start_index:end_index]['var1'])
        X_out1 = sequence.pad_sequences(X_out1, maxlen=200)
        # meta input: label-encode, then one-hot encode the two categorical columns
        X_out2 = df.iloc[start_index:end_index][['var2', 'var3']]
        X_out2 = input_encoder.transform(X_out2)
        X_out2 = onehot.transform(X_out2)
        Y_out = df.iloc[start_index:end_index]['code']
        yield [X_out1, X_out2], [Y_out]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['var1'])
input_encoder = MultiColumnLabelEncoder()
train_X2 = df_train[['var2', 'var3']]
valid_X2 = df_valid[['var2', 'var3']]
input_encoder.fit(train_X2)
onehot = OneHotEncoder(sparse=False, categories='auto')
# fit the one-hot encoder on the label-encoded columns, matching what the generator transforms
onehot.fit(input_encoder.transform(train_X2))
code_type = 'code'
train_labels = df_train[code_type]
valid_labels = df_valid[code_type]
label_encoder = LabelEncoder()
labels = set(df_train[code_type].tolist() + df_valid[code_type].tolist())
label_encoder.fit(list(labels))
n_classes = len(labels)
print('n_classes = %s' % n_classes)
input_text = Input(shape=(200,), dtype='int32', name='input_text')
meta_input = Input(shape=(train_X2.shape[1],), name='meta_input')
embedding = Embedding(input_dim=len(tokenizer.word_index) + 1,
                      output_dim=300,
                      input_length=200)(input_text)
lstm = Bidirectional(LSTM(units=128,
                          dropout=0.5,
                          recurrent_dropout=0.5,
                          return_sequences=True),
                     merge_mode='concat')(embedding)
pool = GlobalMaxPooling1D()(lstm)
dropout = Dropout(0.5)(pool)
text_output = Dense(n_classes, activation='sigmoid', name='aux_output')(dropout)
output = concatenate([text_output, meta_input])
output = Dense(n_classes, activation='relu')(output)
main_output = Dense(n_classes, activation='softmax', name='main_output')(output)
model = Model(inputs=[input_text, meta_input], outputs=[main_output])
optimizer = Adam(lr=.001)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])
# Generators
train_generator = generator(df_train, vocab_size, batch_size, tokenizer, input_encoder, onehot)
validation_generator = generator(df_valid, vocab_size, batch_size, tokenizer, input_encoder, onehot)
model.summary()
model.fit_generator(generator=train_generator,
                    validation_data=validation_generator,
                    epochs=20,
                    steps_per_epoch=len(df_train) // batch_size,
                    validation_steps=len(df_valid) // batch_size,
                    shuffle=True)
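For completeness, pulling a single batch from the generator reproduces the mismatch (a quick sanity check using the objects defined above):

[x1, x2], [y] = next(train_generator)
print(x1.shape)  # (batch_size, 200)
print(x2.shape)  # (batch_size, 7668), while meta_input was built with shape (2,)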