My loss is "nan" and my accuracy is "0.0000e+00" in Word2Vec

Time: 2021-05-09 15:35:52

Tags: python tensorflow keras

I am working with Word2Vec, classifying texts written by 8 authors. While training my model I get nan as the loss and 0.0000e+00 as the accuracy in every epoch (I trained for 10 epochs). For each author I use 10 fiction books for training and 2 for testing. Since I am new to machine learning I cannot find the mistake. Building the vocabulary is the first stage and training word2vec is the second; both work fine. The error occurs at the next stage, when the pretrained word2vec embedding is used for classification.
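The vocabulary building and word2vec training stages are not shown below. For context, a text-format embedding file such as embedding_word2vec.txt is typically produced with gensim roughly as in this minimal sketch (gensim and the exact parameters here are assumptions, not something stated in the question):

from gensim.models import Word2Vec

# a few tokenised documents stand in for the cleaned training texts
sentences = [['the', 'quick', 'brown', 'fox'], ['a', 'slow', 'green', 'turtle']]

# vector_size=100 matches the 100-dimensional Embedding layer used later
# (in gensim < 4.0 the parameter is called size instead of vector_size)
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# save in the plain-text word2vec format; the first line is a header
# ("<vocab size> <dimensions>"), which is why load_embedding() below skips it
w2v_model.wv.save_word2vec_format('embedding_word2vec.txt', binary=False)

The classification code that produces the nan loss is below.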

from string import punctuation
from os import listdir
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# turn a doc into clean tokens
def clean_doc(doc, vocab):
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    # filter out tokens not in vocab
    tokens = [w for w in tokens if w in vocab]
    tokens = ' '.join(tokens)
    return tokens

# load all docs in a directory
def process_docs(directory, vocab, is_train):
    documents = list()
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip test files (names starting with 't') when building the training set
        if is_train and filename.startswith('t'):
            continue
        # skip training files when building the test set
        if not is_train and not filename.startswith('t'):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # load the doc
        doc = load_doc(path)
        # clean doc
        tokens = clean_doc(doc, vocab)
        # add to list
        documents.append(tokens)
    return documents

# load embedding as a dict
def load_embedding(filename):
    # load embedding into memory, skip first line
    file = open(filename,'r')
    lines = file.readlines()[1:]
    file.close()
    # create a map of words to vectors
    embedding = dict()
    for line in lines:
        parts = line.split()
        # key is string word, value is numpy array for vector
        embedding[parts[0]] = asarray(parts[1:], dtype='float32')
    return embedding

# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab):
    # total vocabulary size plus 1 for the padding/unknown index 0
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = zeros((vocab_size, 100))
    # step through the vocab, storing each vector at the Tokenizer's integer index
    for word, i in vocab.items():
        weight_matrix[i] = embedding.get(word)
    return weight_matrix

# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

# load all training documents
a1 = process_docs('txt/author1', vocab, True)
a2 = process_docs('txt/author2', vocab, True)
a3 = process_docs('txt/author3', vocab, True)
a4 = process_docs('txt/author4', vocab, True)
a5 = process_docs('txt/author5', vocab, True)
a6 = process_docs('txt/author6', vocab, True)
a7 = process_docs('txt/author7', vocab, True)
a8 = process_docs('txt/author8', vocab, True)
a9 = process_docs('txt/author9', vocab, True)
train_docs = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9


# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(train_docs)

# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)
# pad sequences
max_length = max([len(s.split()) for s in train_docs])
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define training labels
ytrain = array([author for author in range(9) for _ in range(10)])  # 10 training books per author, authors 0..8

# load all test documents
a1 = process_docs('txt/author1', vocab, False)
a2 = process_docs('txt/author2', vocab, False)
a3 = process_docs('txt/author3', vocab, False)
a4 = process_docs('txt/author4', vocab, False)
a5 = process_docs('txt/author5', vocab, False)
a6 = process_docs('txt/author6', vocab, False)
a7 = process_docs('txt/author7', vocab, False)
a8 = process_docs('txt/author8', vocab, False)
a9 = process_docs('txt/author9', vocab, False)
test_docs = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9

# sequence encode
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels
ytest = array([author for author in range(9) for _ in range(2)])  # 2 test books per author, authors 0..8

# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1

# load embedding from file
raw_embedding = load_embedding('embedding_word2vec.txt')
# get vectors in the right order
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)
# create the embedding layer
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_vectors],
                            input_length=max_length, trainable=False)

# define model
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)
# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))
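A minimal way to narrow down where the nan might come from, using the variables defined above (the nine-class head at the end is an assumption about the intended setup, not code from the question):

import numpy as np

# embedding.get(word) returns None for any vocabulary word that is missing from
# embedding_word2vec.txt, and NumPy stores None in a float row as NaN, so even
# one missing word silently puts NaN rows into the frozen Embedding weights
nan_rows = np.isnan(embedding_vectors).any(axis=1)
print('weight rows containing NaN:', int(nan_rows.sum()), 'of', len(embedding_vectors))

# the labels run from 0 to 8 (nine author folders), which a single sigmoid unit
# trained with binary_crossentropy cannot represent; a nine-class head would
# normally look like this instead
# model.add(Dense(9, activation='softmax'))
# model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])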

0 Answers:

There are no answers yet.