机器学习word2vec嵌入keras文本分类

时间:2019-01-16 21:21:13

标签: python machine-learning keras nlp word2vec

我使用以下句子分类数据集（https://archive.ics.uci.edu/ml/datasets/Sentence+Classification）。我提取了词汇表，创建了 word2vec 模型并保存。随后我尝试将这个 word2vec 嵌入输入到神经网络中。问题是我的准确率非常低（5.76%）。知道为什么会这样吗？

from string import punctuation
from os import listdir
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelBinarizer

# load embedding as a dict
# load embedding as a dict
def load_embedding(filename):
    """Load a word2vec-format text embedding file into a dict.

    The first line (the "vocab_size dim" header written by word2vec /
    gensim) is skipped.

    Parameters
    ----------
    filename : str
        Path to the embedding file: one word per line followed by its
        space-separated float components.

    Returns
    -------
    dict
        Maps each word (str) to its vector as a float32 numpy array.
    """
    embedding = dict()
    # 'with' guarantees the file handle is closed even if parsing raises;
    # utf-8 is the conventional encoding for word2vec text dumps.
    with open(filename, 'r', encoding='utf-8') as file:
        for line_no, line in enumerate(file):
            if line_no == 0:
                continue  # skip the "vocab_size dim" header line
            parts = line.split()
            # key is string word, value is numpy array for vector
            embedding[parts[0]] = asarray(parts[1:], dtype='float32')
    return embedding

# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab, dim=100):
    """Build the Embedding-layer weight matrix in Tokenizer index order.

    Parameters
    ----------
    embedding : dict
        Word -> float vector mapping (e.g. from ``load_embedding``).
    vocab : dict
        Tokenizer ``word_index`` mapping word -> integer id (1-based).
    dim : int, optional
        Embedding vector dimensionality (default 100, matching the
        original hard-coded value).

    Returns
    -------
    numpy.ndarray
        Matrix of shape (len(vocab) + 1, dim); row 0 and rows for words
        absent from ``embedding`` stay all-zero.
    """
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = zeros((vocab_size, dim))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        vector = embedding.get(word)
        # BUG FIX: the original assigned embedding.get(word) directly, so a
        # vocab word missing from the embedding assigned None into the numpy
        # row and raised TypeError. Leave such rows as zeros instead.
        if vector is not None:
            weight_matrix[i] = vector
    return weight_matrix


# load all training reviews

# NOTE(review): x_train / y_train / x_test / y_test must already be defined
# (loaded from the UCI sentence-classification dataset) — not shown here.
train_docs = x_train
print(train_docs[:5])

# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the training documents only
tokenizer.fit_on_texts(train_docs)

# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)
print(encoded_docs[:5])
# pad every sequence to the length of the longest training document
max_length = max([len(s.split()) for s in train_docs])
print("Max_length : ", max_length)
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# one-hot encode the string labels; fit on the TRAINING labels only so the
# class-to-column mapping is fixed once
encoder = LabelBinarizer()
y_train = encoder.fit_transform(y_train)

print(y_train)

# load all test reviews
test_docs = x_test
# sequence encode with the tokenizer fitted on the training data
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels
# BUG FIX: was fit_transform(), which re-fits the binarizer on the test
# labels and can silently reorder the class columns relative to training;
# transform() reuses the mapping learned from y_train.
y_test = encoder.transform(y_test)

vocab_size = len(tokenizer.word_index) + 1

# load embedding from file
embedding_word2vec_filename = "UciSentClassif.txt"
raw_embedding = load_embedding(embedding_word2vec_filename)
# get vectors in the right order
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)
# create the embedding layer (trainable, initialized from word2vec)
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_vectors], input_length=max_length, trainable=True)

# define model
num_classes = y_train.shape[1]  # derive from the binarized labels instead of hard-coding 5
model = Sequential()
model.add(embedding_layer)
model.add(Flatten())
# BUG FIX: the output layer must emit a probability distribution over the
# classes for categorical_crossentropy to be meaningful — 'softmax', not
# 'tanh'. A tanh output here is the likely cause of the reported ~5.76%
# (chance-level) accuracy.
model.add(Dense(num_classes, activation='softmax'))

# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(Xtrain, y_train, epochs=10, verbose=2, batch_size=32)
# evaluate
loss, acc = model.evaluate(Xtest, y_test, verbose=2)
print('\nTest Accuracy: %f' % (acc * 100))

0 个答案:

没有答案
相关问题