我目前正在研究一个序列标注器,它把带标签的句子作为序列。准确率很高,但预测结果完全没有用(在大多数情况下只预测 -PAD-,大多数句子都被归为这一类;增加 epoch 数可以缓解,但预测仍然不好)。准确率完全不能反映这一点。也许有人知道原因。代码如下:
import pandas as pd
import numpy as np
def get_data(filename):
    """Load labelled sentences from an Excel sheet.

    The sheet must contain the columns "Classification", "Text" and "ID".
    Every other column is ignored, and rows with a missing value in any of
    the three columns are discarded.

    Args:
        filename: Path of the Excel file, passed straight to
            ``pandas.read_excel``.

    Returns:
        A tuple ``(sentences, sentence_tags, x_ids)`` of equally long lists.
        Each sentence is the list of tokens obtained by splitting the "Text"
        cell on single spaces; ``sentence_tags`` holds the matching
        "Classification" values and ``x_ids`` the matching "ID" values.

    Raises:
        KeyError: If one of the required columns is missing from the sheet.
    """
    selected = ["Classification", "Text", "ID"]
    # Keep only the required columns, then drop incomplete rows.
    df = pd.read_excel(filename)[selected].dropna(axis=0, how="any")
    sentence_tags = df["Classification"].tolist()
    x_ids = df["ID"].tolist()
    # A cell that holds only a number is not a str; coerce it so that it is
    # tokenised instead of raising AttributeError on ``.split``.
    sentences = [str(text).split(" ") for text in df["Text"].tolist()]
    return sentences, sentence_tags, x_ids
# Load both splits: tokenised sentences, their tags and their document ids.
train_sentences, train_tags, train_ids = get_data("./data/training_data.xlsx")
test_sentences, test_tags, test_ids = get_data("./data/test_data.xlsx")

# The (lower-cased) vocabulary and the tag inventory come from the training
# split only.
words = {token.lower() for sentence in train_sentences for token in sentence}
tags = set(train_tags)

# Indices 0 and 1 are reserved for the special tokens, real words start at 2.
word2index = {word: index for index, word in enumerate(words, start=2)}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs (Out of Vocabulary)

# Index 0 is reserved for padding, real tags start at 1.
tag2index = {tag: index for index, tag in enumerate(tags, start=1)}
tag2index['-PAD-'] = 0  # The special value used for padding

# Encode every sentence as a list of word indices; words that were not seen
# during training map to the -OOV- index.
train_sentences_X = [
    [word2index.get(token.lower(), word2index['-OOV-']) for token in sentence]
    for sentence in train_sentences
]
test_sentences_X = [
    [word2index.get(token.lower(), word2index['-OOV-']) for token in sentence]
    for sentence in test_sentences
]

# Encode the tags; a test tag that never occurs in training raises KeyError.
train_tags_y = [tag2index[tag] for tag in train_tags]
test_tags_y = [tag2index[tag] for tag in test_tags]
def getEmbeddings(x, y, id):
    """Collapse each sentence to one number and group sentences by document.

    Every sentence in ``x`` (a list of word indices) is reduced to the sum of
    its indices divided by the length of the longest sentence. Consecutive
    entries that share the same value in ``id`` are then collected into one
    sub-list, so each sub-list holds the sentence sequence of one document.

    Args:
        x: List of sentences, each a list of numeric word indices.
        y: List with one tag index per sentence.
        id: List with one document identifier per sentence. Only adjacent
            equal identifiers are grouped together.

    Returns:
        A tuple ``(x_emb, y_emb)`` of parallel lists of lists: the
        per-sentence values and the per-sentence tags, one sub-list per run
        of equal identifiers. Both are empty when ``x`` is empty.
    """
    if not x:
        return [], []

    # NOTE: the divisor is the length of the longest sentence, not the
    # sentence's own length, so shorter sentences are scaled down.
    max_len = len(max(x, key=len))
    values = [sum(sentence) / max_len for sentence in x]

    x_emb, y_emb = [], []
    x_group, y_group = [], []
    previous_id = id[0]
    for i, current_id in enumerate(id):
        if current_id != previous_id:
            # A new document starts: store the finished one.
            x_emb.append(x_group)
            y_emb.append(y_group)
            x_group, y_group = [], []
            previous_id = current_id
        x_group.append(values[i])
        y_group.append(y[i])

    # Store the last document as well; without this the final run of ids
    # would be silently dropped.
    x_emb.append(x_group)
    y_emb.append(y_group)
    return x_emb, y_emb
# Collapse every sentence to a single value and group the sentences that
# belong to the same document.
train_sentences_X, train_tags_y = getEmbeddings(train_sentences_X, train_tags_y, train_ids)
test_sentences_X, test_tags_y = getEmbeddings(test_sentences_X, test_tags_y, test_ids)

# PADDING
# The longest training document fixes the sequence length for both splits.
MAX_LENGTH = max(len(document) for document in train_sentences_X)
from keras.preprocessing.sequence import pad_sequences

# NOTE(review): pad_sequences is called without an explicit dtype; if its
# default is an integer type, the fractional sentence values produced by
# getEmbeddings are truncated here - confirm this is intended.
train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = (
    pad_sequences(sequences, maxlen=MAX_LENGTH, padding='post')
    for sequences in (train_sentences_X, test_sentences_X, train_tags_y, test_tags_y)
)

# Show the first padded row of each array as a sanity check.
for padded in (train_sentences_X, test_sentences_X, train_tags_y, test_tags_y):
    print(padded[0])
from keras import backend as K
def ignore_class_accuracy(to_ignore=0):
    """Build a Keras metric that measures accuracy outside one ignored class.

    Args:
        to_ignore: Class index whose positions are excluded from the metric
            (0 by default, which this script uses for the -PAD- tag).

    Returns:
        A function ``ignore_accuracy(y_true, y_pred)`` usable in
        ``model.compile(metrics=[...])``. Both arguments carry class scores
        on their last axis; the argmax over that axis is taken as the class.
    """
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)

        # Mask by the TRUE label. Masking by the predicted label instead
        # would drop every real position the model labels as the ignored
        # class and would still count padding positions it labels otherwise.
        keep_mask = K.cast(K.not_equal(y_true_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * keep_mask
        # The maximum with 1 avoids a division by zero when every position
        # belongs to the ignored class.
        accuracy = K.sum(matches) / K.maximum(K.sum(keep_mask), 1)
        return accuracy
    return ignore_accuracy
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam
# Tagger over padded sequences of length MAX_LENGTH: embedding lookup,
# bidirectional LSTM that returns one output per step, then a per-step dense
# layer with one unit per tag followed by a softmax.
model = Sequential([
    InputLayer(input_shape=(MAX_LENGTH,)),
    Embedding(len(word2index), 128),
    Bidirectional(LSTM(256, return_sequences=True)),
    TimeDistributed(Dense(len(tag2index))),
    Activation('softmax'),
])

# Report plain accuracy and the accuracy that skips class 0.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(0.001),
    metrics=['accuracy', ignore_class_accuracy(0)],
)

# Prints most important information of the model
model.summary()
def to_categorical(sequences, categories):
    """One-hot encode every class index in a collection of sequences.

    Args:
        sequences: Iterable of sequences of integer class indices.
        categories: Length of each one-hot vector.

    Returns:
        ``np.array`` built from the nested lists of one-hot vectors; for
        equally long sequences its shape is
        ``(len(sequences), sequence_length, categories)``.
    """
    def one_hot(index):
        # Vector of zeros with a single 1.0 at the class position.
        vector = np.zeros(categories)
        vector[index] = 1.0
        return vector

    return np.array([[one_hot(item) for item in sequence] for sequence in sequences])
# One-hot encode the tag sequences once and reuse the result.
cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))
cat_test_tags_y = to_categorical(test_tags_y, len(tag2index))

# Training!
model.fit(train_sentences_X, cat_train_tags_y, batch_size=128, epochs=3, validation_split=0.2)

# scores[0] is the loss; scores[1] is the first metric passed to compile.
scores = model.evaluate(test_sentences_X, cat_test_tags_y)
print(f"{model.metrics_names[1]}: {scores[1] * 100}")  # e.g. acc: 86.80813234928902

# Sequential.predict_classes is not available in newer Keras/TensorFlow
# releases; the argmax over the softmax output gives the same class indices.
predictions = np.argmax(model.predict(test_sentences_X), axis=-1)

# Map the class indices back to tag names.
index2tag = {i: t for t, i in tag2index.items()}
predict = [[index2tag[s] for s in p] for p in predictions]
labels = [[index2tag[s] for s in p] for p in test_tags_y]

# Print gold tag, predicted tag and text for every non-padding position.
# k walks through test_sentences in step with the non-padding labels.
k = 0
for label_row, predicted_row in zip(labels, predict):
    for label, predicted in zip(label_row, predicted_row):
        if label != '-PAD-':
            print('Tag: {}, Prediction: {}, Text: {}'.format(label, predicted, " ".join(test_sentences[k])))
            k += 1
测试/训练文件具有以下格式:
Classification ID Text
pi d1 Henry Example 80, died from complications following surgery on ...
有8个类别+ -PAD-类别!
编辑:我自己计算了准确率(epochs = 200),结果为 0.4359447004608295,与 model.evaluate 给出的值不同!
编辑:我尝试了一些方法,但都没有成功。我将 Embedding 的 'mask_zero' 设置为 True 并删除了 ignore_class_accuracy 方法,也尝试过把损失函数改为 binary_crossentropy 并配合 sigmoid 激活,但如上所述,没有任何改进。测试数据上的准确率从未超过 48%(训练期间的准确率并非如此)。