I have been trying to build a multi-label text classifier with Keras using GloVe embeddings. I am currently experimenting with the TREC-6 dataset available at http://cogcomp.org/Data/QA/QC/. I only consider the 5 broad labels for the classification problem and ignore the sub-labels.
Since this is a multi-label classification problem, given a sentence my network should output every label with a probability greater than 0.1. The problem is that the network almost always assigns exactly one label to a question. That is fine when I ask a question that belongs to a single category, but when I combine questions from different categories it still returns only one label with high confidence most of the time, even though I expect all the relevant labels to be identified. I am absolutely sure the preprocessing steps are correct, so I suspect the problem is with my model.
Initially I experimented with CNNs only, following the paper "Convolutional Neural Networks for Sentence Classification" (https://www.aclweb.org/anthology/D14-1181.pdf), but on my teacher's advice, and after seeing them fail on long questions spanning different topics, I started experimenting with LSTMs and BiLSTMs. I started from this approach, https://www.kaggle.com/c/quora-insincere-questions-classification/discussion/80568, and kept tweaking parameters and adding/removing layers hoping for good results, but so far I have failed. I also tried copy-pasting some code for an Attention mechanism and adding it after my LSTM layer, but that did not help either.
My current model looks like this. For clarity, I will paste most of the rest of the code as well. The model and training code are in the sentence_classifier_cnn() method.
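For reference, loader_encoder() below reads the dataset from a JSON file where each entry has a 'question' field and a comma-separated 'tags' field, roughly like this (the file name, questions and tag combinations here are just made-up examples, not actual rows from my data):

import json

# Rough illustration of the dataset layout read by loader_encoder();
# the questions and tag combinations are invented examples.
sample = [
    {"question": "How many days are there until the new year ?", "tags": "numeric"},
    {"question": "Who discovered radium and where was she born ?", "tags": "human,location"},
]
with open("data/trec.json", "w", encoding="utf8") as f:
    json.dump(sample, f, indent=2)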
import json
import os
import re
import sys

import numpy as np
import keras
import mysql.connector
from keras.layers import Dense, Dropout, Embedding, Input
from keras.models import Model, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import MultiLabelBinarizer


class SentenceClassifier:
    def __init__(self):
        self.MAX_SEQUENCE_LENGTH = 200
        self.EMBEDDING_DIM = 100
        self.LABEL_COUNT = 0
        self.WORD_INDEX = dict()
        self.LABEL_ENCODER = None
    def clean_str(self, string):
        """
        Cleans each string and converts it to lower case.
        """
        string = re.sub(r"\'s", "", string)
        string = re.sub(r"\'ve", "", string)
        string = re.sub(r"n\'t", " not", string)
        string = re.sub(r"\'re", "", string)
        string = re.sub(r"\'d", "", string)
        string = re.sub(r"\'ll", "", string)
        string = re.sub(r"[^A-Za-z0-9]", " ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip().lower()
    def loader_encoder(self, table, type="json"):
        """
        Load and encode data from dataset.
        type = "sql" means get data from MySQL database.
        type = "json" means get data from .json file.
        """
        if type == "json":
            with open('data/' + table + '.json', 'r', encoding='utf8') as f:
                datastore = json.load(f)
            questions = []
            tags = []
            for row in datastore:
                questions.append(row['question'])
                tags.append(row['tags'].split(','))

        tokenizer = Tokenizer(lower=True, char_level=False)
        tokenizer.fit_on_texts(questions)
        self.WORD_INDEX = tokenizer.word_index

        questions_encoded = tokenizer.texts_to_sequences(questions)
        questions_encoded_padded = pad_sequences(questions_encoded, maxlen=self.MAX_SEQUENCE_LENGTH, padding='post')

        # Drop empty tags left behind by trailing commas.
        for i, ele in enumerate(tags):
            tags[i] = [tag for tag in ele if len(tag) > 0 and tag != ',']

        encoder = MultiLabelBinarizer()
        encoder.fit(tags)
        self.LABEL_ENCODER = encoder
        tags_encoded = encoder.fit_transform(tags)
        self.LABEL_COUNT = len(tags_encoded[0])  # No. of labels

        print("\tUnique Tokens in Training Data: ", len(self.WORD_INDEX))

        return questions_encoded_padded, tags_encoded
    def load_embeddings(self, EMBED_PATH='./embeddings/glove.6B.100d.txt'):
        """
        Load pre-trained embeddings into memory.
        """
        embeddings_index = {}
        try:
            f = open(EMBED_PATH, encoding='utf-8')
        except FileNotFoundError:
            print("Embeddings missing.")
            sys.exit()
        for line in f:
            values = line.rstrip().rsplit(' ')
            word = values[0]
            vec = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vec
        f.close()
        print("\tNumber of tokens in embeddings file: ", len(embeddings_index))
        return embeddings_index
    def create_embedding_matrix(self, embeddings_index):
        """
        Creates an embedding matrix for all the words (vocab) in the training data with shape (vocab, EMBEDDING_DIM).
        Out-of-vocab words will be randomly initialized to values between +0.25 and -0.25.
        """
        words_not_found = []
        vocab = len(self.WORD_INDEX) + 1
        embedding_matrix = np.random.uniform(-0.25, 0.25, size=(vocab, self.EMBEDDING_DIM))
        for word, i in self.WORD_INDEX.items():
            if i >= vocab:
                continue
            embedding_vector = embeddings_index.get(word)
            if (embedding_vector is not None) and len(embedding_vector) > 0:
                embedding_matrix[i] = embedding_vector
            else:
                words_not_found.append(word)
        # print('Number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
        print("\tShape of embedding matrix: ", str(embedding_matrix.shape))
        print("\tNo. of words not found in pre-trained embeddings: ", len(words_not_found))
        return embedding_matrix
    def sentence_classifier_cnn(self, embedding_matrix, x, y, table, load_saved=0):
        """
        A static CNN model.
        Makes use of the Keras functional API for constructing the model.
        If load_saved=1, THEN load old model, ELSE train new model.
        """
        model_name = table + ".model.h5"
        if load_saved == 1 and os.path.exists('./saved/' + model_name):
            print("\nLoading saved model...")
            model = load_model('./saved/' + model_name)
            print("Model Summary")
            print(model.summary())
            return model

        print("\nTraining model...")
        inputs = Input(shape=(self.MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedding = Embedding(input_dim=(len(self.WORD_INDEX) + 1), output_dim=self.EMBEDDING_DIM,
                              weights=[embedding_matrix],
                              input_length=self.MAX_SEQUENCE_LENGTH)(inputs)

        X = keras.layers.SpatialDropout1D(0.3)(embedding)
        X = keras.layers.Bidirectional(keras.layers.CuDNNLSTM(64, return_sequences=True))(X)
        # X2 = keras.layers.Bidirectional(keras.layers.CuDNNGRU(128, return_sequences=False))(X)
        X = keras.layers.Conv1D(32, kernel_size=2, padding='valid', kernel_initializer='normal')(X)
        X = keras.layers.GlobalMaxPooling1D()(X)
        # X = Attention(self.MAX_SEQUENCE_LENGTH)(X)
        X = Dropout(0.5)(X)
        X = keras.layers.Dense(16, activation="relu")(X)
        X = Dropout(0.5)(X)
        X = keras.layers.BatchNormalization()(X)
        output = Dense(units=self.LABEL_COUNT, activation='sigmoid')(X)

        model = Model(inputs=inputs, outputs=output, name='intent_classifier')
        print("Model Summary")
        print(model.summary())

        # OutputObserver is a custom callback that prints predictions after every epoch;
        # `classifier` is the module-level SentenceClassifier instance.
        cbk = OutputObserver(model, classifier)

        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

        model.fit(x, y,
                  batch_size=30,
                  epochs=23,
                  verbose=2,
                  callbacks=[cbk])

        # keras.utils.vis_utils.plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

        return model
    def tag_question(self, model, question):
        question = self.clean_str(question)
        question_encoded = [[self.WORD_INDEX[w] for w in question.split(' ') if w in self.WORD_INDEX]]
        question_encoded_padded = pad_sequences(question_encoded, maxlen=self.MAX_SEQUENCE_LENGTH, padding='post')
        predictions = model.predict(question_encoded_padded)

        possible_tags = []
        for i, probability in enumerate(predictions[0]):
            if probability >= 0.01:
                possible_tags.append([self.LABEL_ENCODER.classes_[i], probability])

        possible_tags.sort(reverse=True, key=lambda x: x[1])  # sort in place by the probability in each sub-list, descending
        print(possible_tags)
    def setup_classifier(self, table):
        """
        Loads the data set, builds the embedding matrix and trains (or loads) the model.
        """
        print("Loading Data Set...")
        x, y = self.loader_encoder(table)

        embeddings_index = self.load_embeddings()

        print("\nGenerating embedding matrix...")
        embedding_matrix = self.create_embedding_matrix(embeddings_index)

        # Loading / Training model
        model = self.sentence_classifier_cnn(embedding_matrix, x, y, table, load_saved=1)

        return model, embeddings_index
    def connect_to_db(self):
        mydb = mysql.connector.connect(host="localhost", user="root", passwd="root", database="questiondb")
        cursor = mydb.cursor()
        return mydb, cursor
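The OutputObserver callback used in sentence_classifier_cnn() is defined elsewhere in my notebook and not pasted here; roughly, it just runs tag_question() on a fixed test question at the end of every epoch, something along these lines (a sketch of the idea, not the exact code):

class OutputObserver(keras.callbacks.Callback):
    """
    Sketch of the callback: prints the predicted tags for a fixed test question
    at the end of each epoch.
    """
    def __init__(self, model, classifier):
        super().__init__()
        self.observed_model = model
        self.classifier = classifier
        # Same example question used further below.
        self.test_question = ("who is the prophet of the muslim people and where is "
                              "india located and how much do fruits costs there?")

    def on_epoch_end(self, epoch, logs=None):
        self.classifier.tag_question(self.observed_model, self.test_question)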
As you can see, I use the callback to print my predictions after every epoch. I have tried predicting tags for various questions, but I only get good results for questions that belong to a single category.
For example,
classifier.tag_question(model, "how many days before new year?")
gives
[['numeric', 0.99226487]]
as the output. But a more complex question such as
classifier.tag_question(model, "who is the prophet of the muslim people and where is india located and how much do fruits costs there?")
gives something like
[['human', 0.9990531]]
as the output, even though tags like 'location' and 'numeric' are also relevant. Using the callback to predict the tags for this question after every epoch, I see something like this:
Epoch 1/23
- 19s - loss: 0.6581 - acc: 0.6365
[['human', 0.69752634], ['location', 0.40014982], ['entity', 0.32047516], ['abbreviation', 0.23877779], ['numeric', 0.23324837], ['description', 0.15995058]]
Epoch 2/23
- 12s - loss: 0.4525 - acc: 0.8264
[['human', 0.7437608], ['location', 0.18141672], ['entity', 0.14474556], ['numeric', 0.09171515], ['description', 0.053900182], ['abbreviation', 0.05283475]]
Epoch 3/23
- 12s - loss: 0.3854 - acc: 0.8478
[['human', 0.86335427], ['location', 0.12673976], ['entity', 0.09847507], ['numeric', 0.064431995], ['description', 0.035599917], ['abbreviation', 0.02441895]]
Epoch 4/23
- 12s - loss: 0.3634 - acc: 0.8509
[['human', 0.90795004], ['location', 0.10085008], ['entity', 0.09804481], ['numeric', 0.050411616], ['description', 0.032810867], ['abbreviation', 0.014970899]]
Epoch 5/23
- 13s - loss: 0.3356 - acc: 0.8582
[['human', 0.8365586], ['entity', 0.1130701], ['location', 0.10253032], ['numeric', 0.039931685], ['description', 0.02874279]]
Epoch 6/23
- 13s - loss: 0.3142 - acc: 0.8657
[['human', 0.95577633], ['entity', 0.088555306], ['location', 0.055004593], ['numeric', 0.015950901], ['description', 0.01428318]]
Epoch 7/23
- 13s - loss: 0.2942 - acc: 0.8750
[['human', 0.89538944], ['entity', 0.130977], ['location', 0.06350105], ['description', 0.023014158], ['numeric', 0.019377537]]
Epoch 8/23
- 13s - loss: 0.2739 - acc: 0.8802
[['human', 0.9725125], ['entity', 0.061141968], ['location', 0.026945814], ['description', 0.010931551]]
Epoch 9/23
- 13s - loss: 0.2579 - acc: 0.8914
[['human', 0.9797143], ['entity', 0.042518377], ['location', 0.027904237]]
Epoch 10/23
- 13s - loss: 0.2380 - acc: 0.9020
[['human', 0.7897601], ['entity', 0.14315197], ['location', 0.07439863], ['description', 0.019453615], ['numeric', 0.010681627]]
Epoch 11/23
- 13s - loss: 0.2250 - acc: 0.9104
[['human', 0.9886158], ['entity', 0.024878502], ['location', 0.015951043]]
Epoch 12/23
- 13s - loss: 0.2131 - acc: 0.9178
[['human', 0.9677731], ['entity', 0.03698206], ['location', 0.026153017]]
Epoch 13/23
- 13s - loss: 0.2029 - acc: 0.9204
[['human', 0.9514474], ['entity', 0.053581357], ['location', 0.029657435]]
Epoch 14/23
- 13s - loss: 0.1915 - acc: 0.9285
[['human', 0.9706739], ['entity', 0.0328649], ['location', 0.013876333]]
Epoch 15/23
- 13s - loss: 0.1856 - acc: 0.9300
[['human', 0.9328136], ['location', 0.05573874], ['entity', 0.025918543]]
Epoch 16/23
- 13s - loss: 0.1802 - acc: 0.9318
[['human', 0.9895527], ['entity', 0.014941782], ['location', 0.011972391]]
Epoch 17/23
- 13s - loss: 0.1717 - acc: 0.9373
[['human', 0.9426272], ['entity', 0.03754583], ['location', 0.023379702]]
Epoch 18/23
- 13s - loss: 0.1614 - acc: 0.9406
[['human', 0.99186605]]
Epoch 19/23
- 13s - loss: 0.1573 - acc: 0.9432
[['human', 0.9926062]]
Epoch 20/23
- 13s - loss: 0.1511 - acc: 0.9448
[['human', 0.9993554]]
Epoch 21/23
- 13s - loss: 0.1591 - acc: 0.9426
[['human', 0.9964465]]
Epoch 22/23
- 13s - loss: 0.1507 - acc: 0.9451
[['human', 0.999688]]
Epoch 23/23
- 13s - loss: 0.1524 - acc: 0.9436
[['human', 0.9990531]]
I have tried changing the parameters hundreds of times, especially the network size, batch size and number of epochs, to try to avoid overfitting.
I know my question is long, but I am running out of patience and any help would be greatly appreciated.
Here is the link to my Colab notebook: https://colab.research.google.com/drive/1EOklUw7efOv69HvWKpuKVy1LSzcvTTCk