fit_generator to transform input into batches for an LSTM model

Time: 2019-04-04 07:43:11

Tags: tensorflow keras deep-learning

I have a large dataset. When it is tokenized and converted into one-hot encoded vectors (the input format for the LSTM model), the conversion to one-hot encoding throws a memory error.

I tried a GPU machine with more processing power and memory, but nothing seemed to help. However, I have read about using fit_generator. Can someone help me implement fit_generator in the code below?

# Directory structure for dijkstra
data_dir = '/home/rjagannath1/data/mimic-data/txt'
save_dir = '/home/rjagannath1/save'

seq_length = 30 # sequence length
sequences_step = 1 #step to create sequences


import os
from os.path import join
import codecs
import collections
import nltk
import numpy as np
from six.moves import cPickle  # on Python 3, `import pickle as cPickle` also works

# Keras 2.x imports used by the model-building and training code below
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Bidirectional, LSTM
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import categorical_accuracy

# for gpu
file_name = 'mimic-data-file-processed.txt'
vocab_file = join(save_dir, "words_vocab.pkl")

num_words = 0
with open(data_dir+'/'+ file_name, 'r') as f:
# with open(data_dir+'/'+txt, 'r', encoding="utf8") as f:
    for line in f:
        words = line.split()
        num_words += len(words)
print("Number of words:")
print(num_words)

# read data
def create_wordlist(doc):
    wl = []
    for word in doc:
        if word.text not in ("\n","\n\n",'\u2009','\xa0'):
            wl.append(word.text.lower())
    return wl

# create list of sentences
wordlist = []
input_file = os.path.join(data_dir, file_name)
#read data
with codecs.open(input_file, "r", encoding="utf8") as f:
    data = f.read()

#create sentences
wl = nltk.word_tokenize(data) # using NLTK
wordlist = wordlist + wl 

# create a dictionary

# count the number of words
word_counts = collections.Counter(wordlist)

# Mapping from index to word : that's the vocabulary
vocabulary_inv = [x[0] for x in word_counts.most_common()]
vocabulary_inv = list(sorted(vocabulary_inv))

# Mapping from word to index
vocab = {x: i for i, x in enumerate(vocabulary_inv)}
words = [x[0] for x in word_counts.most_common()]

#size of the vocabulary
vocab_size = len(words)
print("vocab size: ", vocab_size)
print(words)

#save the words and vocabulary
with open(os.path.join(vocab_file), 'wb') as f:
    cPickle.dump((words, vocab, vocabulary_inv), f)

# create sequences
sequences = []
next_words = []
for i in range(0, len(wordlist) - seq_length, sequences_step):
    sequences.append(wordlist[i: i + seq_length])
    next_words.append(wordlist[i + seq_length])

print('nb sequences:', len(sequences))   

# training
X = np.zeros((len(sequences), seq_length, vocab_size), dtype=np.bool)
y = np.zeros((len(sequences), vocab_size), dtype=np.bool)
for i, sentence in enumerate(sequences):
    for t, word in enumerate(sentence):
        X[i, t, vocab[word]] = 1
    y[i, vocab[next_words[i]]] = 1

# Build Model
# =============================================================================
def bidirectional_lstm_model(seq_length, vocab_size):
    print('Build LSTM model.')
    model = Sequential()
    model.add(Bidirectional(LSTM(rnn_size, activation="relu"), input_shape=(seq_length, vocab_size)))
    model.add(Dropout(0.6))
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))

    optimizer = Adam(lr=learning_rate)
    callbacks=[EarlyStopping(patience=2, monitor='val_loss')]
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[categorical_accuracy])
    return model

rnn_size = 256 # size of RNN
batch_size = 32 # minibatch size
seq_length = 30 # sequence length
num_epochs = 20 # number of epochs
learning_rate = 0.001 #learning rate
sequences_step = 1 #step to create sequences

md = bidirectional_lstm_model(seq_length, vocab_size)
md.summary()

#fit the model
callbacks=[EarlyStopping(patience=4, monitor='val_loss'),
           ModelCheckpoint(filepath=save_dir + "/" + 'my_model_gen_sentences_lstm.{epoch:02d}-{val_loss:.2f}.hdf5',\
                           monitor='val_loss', verbose=0, mode='auto', period=2)]
history = md.fit(X, y,
                 batch_size=batch_size,
                 shuffle=True,
                 epochs=num_epochs,
                 callbacks=callbacks,
                 validation_split=0.01)

# save the model
md.save(save_dir + "/" + 'my_model_gen_sentences_lstm.final.hdf5')

The memory error occurs here, during the one-hot encoding -

# training
X = np.zeros((len(sequences), seq_length, vocab_size), dtype=np.bool)
y = np.zeros((len(sequences), vocab_size), dtype=np.bool)

How can I use fit_generator to overcome this memory problem?
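From what I have read, I imagine something like the untested sketch below is what is needed. The batch_generator name, the 99/1 index split (mirroring my validation_split=0.01), and the Keras 2.x fit_generator call are my own assumptions, so I am not sure this is correct or how best to wire it into the code above:

def batch_generator(seqs, nexts, batch_size):
    # one-hot encode only batch_size sequences at a time and yield (X, y) forever
    num_batches = len(seqs) // batch_size
    while True:
        for b in range(num_batches):
            start = b * batch_size
            X = np.zeros((batch_size, seq_length, vocab_size), dtype=np.bool)
            y = np.zeros((batch_size, vocab_size), dtype=np.bool)
            for i, sentence in enumerate(seqs[start:start + batch_size]):
                for t, word in enumerate(sentence):
                    X[i, t, vocab[word]] = 1
                y[i, vocab[nexts[start + i]]] = 1
            yield X, y

# my own 99/1 split of the sequences, to mimic validation_split=0.01
split = int(len(sequences) * 0.99)
train_gen = batch_generator(sequences[:split], next_words[:split], batch_size)
val_gen = batch_generator(sequences[split:], next_words[split:], batch_size)

history = md.fit_generator(train_gen,
                           steps_per_epoch=split // batch_size,
                           epochs=num_epochs,
                           callbacks=callbacks,
                           validation_data=val_gen,
                           validation_steps=(len(sequences) - split) // batch_size)

If this is the right approach, the full X and y arrays from the training section above would never be allocated at all; only one batch of shape (batch_size, seq_length, vocab_size) would live in memory at any time.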

0 Answers:

There are no answers yet.