我想使用预先训练的GLoVe向量,LSTM以及具有S型激活的最终密集层在IMDB数据集上训练简单的情感分类器。
我的问题是,获得的准确率相对较低:78%。这低于使用可训练的嵌入层代替GloVe向量时得到的82%的准确率。
我认为主要原因是因为在GLoVe文件中仅发现数据集中67.9%的单词(我使用的是6B语料库)。
我查看了在GLoVe文件中找不到的一些单词,例如:
grandmother's twin's
基本上,在GloVe文件中找不到带撇号(')的单词。
我想知道是否需要对数据进行不同的预处理。目前,预处理由函数imdb.load_data()完成。
我尝试使用较大的42B单词语料库,但覆盖率仅为76.5%。
我想知道是否应该对数据进行不同的标记以得到良好的覆盖范围。
代码是这样的:
load_embeddings.py
from numpy import asarray
import time
def load_embeddings(filename):
    """Load pre-trained word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a token followed by the
    components of its vector (the layout of the GloVe ``.txt`` files).

    Args:
        filename: Path to the UTF-8 encoded embeddings file.

    Returns:
        A dict mapping each token (str) to a 1-D float32 numpy array.
    """
    start_time = time.time()
    embeddings_index = {}
    # The context manager closes the file even when a line fails to parse.
    with open(filename, encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a stray trailing newline): nothing to parse.
                continue
            embeddings_index[values[0]] = asarray(values[1:], dtype='float32')
    end_time = time.time()
    print('Loaded %s word vectors in %f seconds' % (len(embeddings_index), end_time - start_time))
    return embeddings_index
train.py
from __future__ import print_function
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb
from load_embeddings import load_embeddings
# Every review is padded / truncated to this many tokens.
maxlen = 80
batch_size = 32

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data()
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

# Load the word -> index table once and reuse it for the statistics below
# instead of fetching and parsing it a second time.
word_to_index = imdb.get_word_index()
vocab_size = len(word_to_index)
print('Vocab size : ', vocab_size)

# (word, index) pairs ordered by index; the lowest indices are the most
# frequent words.
words_freq_list = list(word_to_index.items())
sorted_list = sorted(words_freq_list, key=lambda x: x[1])
print("50 most common words: \n")
print(sorted_list[0:50])
# Dimensionality of the pre-trained word embeddings.
EMBEDDING_DIM = 100
# GloVe file
GLOVE_FILENAME = 'data/glove.6B.100d.txt'
# Offset that imdb.load_data() adds to every raw word index by default
# (index_from=3). Ids 0, 1 and 2 are reserved for padding, start-of-sequence
# and out-of-vocabulary, id 3 is never emitted, and the most frequent word
# ('the', raw index 1) is therefore encoded as 1 + INDEX_FROM = 4.
INDEX_FROM = 3

# Shift the raw indices by the same offset load_data() applied, so that row i
# of the embedding matrix really belongs to the word encoded as i in
# x_train / x_test. (Using INDEX_FROM - 1 here would give every word the
# vector of its neighbour in the frequency ranking.)
word_to_index = {k: (v + INDEX_FROM) for k, v in imdb.get_word_index().items()}
word_to_index["<PAD>"] = 0
word_to_index["<START>"] = 1
word_to_index["<UNK>"] = 2

embeddings_index = load_embeddings(GLOVE_FILENAME)

# One row per token id, including the largest id present in the data.
num_tokens = max(word_to_index.values()) + 1

# Weight matrix for the words of the dataset. Rows start out as zeros, so the
# reserved ids and every word missing from the GloVe file keep a zero vector.
embedding_matrix = np.zeros((num_tokens, EMBEDDING_DIM))
for word, i in word_to_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    # uncomment below to see which words were not found
    # else:
    #     print(word, ' not found in GLoVe file.')

# Fraction of the vocabulary that received a (non-zero) pre-trained vector.
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
coverage = nonzero_elements / vocab_size
print('Coverage = ',coverage)

# Build and train model
print('Build model...')
model = Sequential()
# input_dim must match the number of rows of the weight matrix.
model.add(Embedding(num_tokens, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False, name= 'embedding'))
model.add(LSTM(EMBEDDING_DIM, dropout=0.2, recurrent_dropout=0.2, name = 'lstm'))
model.add(Dense(1, activation='sigmoid', name='out'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=10,
          validation_data=(x_test, y_test))
score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)
答案 0 :(得分:0)
这个链接可能会有所帮助。你的想法很好。尝试其他预训练向量也不是一个坏主意,有时换一种向量效果马上就会更好。此外,您还可以使用Gensim向GloVe(或其他任何预训练向量)中添加词条。