如何将Keras文本生成示例从字符级别更改为字级别?

时间:2016-07-05 16:15:05

标签: python nlp keras lstm language-model

以下代码或多或少是Keras文档为我们提供的语言模型。问题是这种语言模型预测的是字符,而不是单词。严格地说,语言模型应该预测完整的单词。

我的问题是,如何更改此内容以预测完整的字词?

from __future__ import print_function

import random
import sys

import gensim  # used below: gensim.utils.simple_preprocess
import numpy as np

from keras.layers.core import Dense, Activation, Dropout, TimeDistributedDense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.models import Sequential
from keras.utils.data_utils import get_file

# Path to the training corpus (a plain-text Gutenberg dump).
path = "C:/Users/Cedric     Oeldorf/Desktop/University/Research/Data/Gutenberg/MYDATAFINAL3.txt"
# Read the whole corpus once.  The context manager guarantees the file
# handle is closed, and an explicit encoding avoids platform-dependent
# decoding (the original bare open() would use the Windows locale codec).
with open(path, encoding="utf-8") as f:
    text = f.read().lower()
print('corpus length:', len(text))

# Sort the vocabulary so the char<->index mappings are deterministic
# across runs: a bare set() iterates in arbitrary order, which would make
# indices (and therefore saved model weights) incompatible between runs.
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# Cut the text into semi-redundant sequences of maxlen characters,
# advancing `step` characters between windows; next_chars[i] is the single
# character the model must predict after seeing sentences[i].
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
# One-hot encode inputs and targets.  Use the builtin `bool`: the
# `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

from keras.callbacks import History
histLSTM = History()

# Build the model: two stacked 512-unit LSTMs reading one-hot characters,
# finished by a dense softmax over the character vocabulary.
print('Build model...')
layer_stack = [
    LSTM(512, return_sequences=True, input_shape=(maxlen, len(chars))),
    Dropout(0.2),
    LSTM(512, return_sequences=False),
    Dropout(0.2),
    Dense(len(chars)),
    Activation('softmax'),
]
model = Sequential()
for layer in layer_stack:
    model.add(layer)

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

model.fit(X, y, batch_size=128, nb_epoch=4, callbacks=[histLSTM])

到目前为止,我的数据预处理理念是:

path = "C:/MYDATAFINAL3.txt"
# Read the corpus through a context manager so the handle is closed.
with open(path, encoding="utf-8") as f:
    text = f.read().lower()
print('corpus length:', len(text))

# Tokenize the corpus and build word <-> index lookup tables.  The
# vocabulary is sorted so the mapping is deterministic across runs.
tok = gensim.utils.simple_preprocess(text, deacc=False)
words = sorted(set(tok))
word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))

sentences1 = text.split('.')
SYMBOLS = '{}()[].,:;+-*/&|<>=~$'
# str.translate(None, SYMBOLS) is Python-2-only; on Python 3 a deletion
# table built with str.maketrans does the same single-pass strip.
_delete_symbols = str.maketrans('', '', SYMBOLS)
m = [item.translate(_delete_symbols).strip() for item in sentences1]
del text
del text

# Slide a window of `maxlen` tokens over the corpus, stepping `step`
# tokens at a time; the token immediately after each window is recorded
# as that window's next word.
maxlen = 60
step = 3
sentences = []
next_words = []
last_start = len(tok) - maxlen
for start in range(0, last_start, step):
    window = tok[start: start + maxlen]
    sentences.append(window)
    next_words.append(tok[start + maxlen])
print('nb sequences:', len(sentences))

# Integer-encoded inputs and targets (word indices rather than one-hot).
X = np.zeros((len(sentences), maxlen), dtype="int32")
y = np.zeros((len(sentences), maxlen), dtype="int32")

这一步没有成功:

#In X, change boolean to true for every listed character, same for y
for i, sentence in enumerate(sentences):
    for t, words in enumerate(sentence):
        X[i, t,] = word_indices[words]
    y[i, t] = word_indices[words]

而且我不知道我应该使用什么输入形状:

print('Build model...')
model = Sequential()
# input_shape must describe ONE sample and exclude the batch dimension —
# the original (len(sentences), maxlen) baked the dataset size into the
# network.  X holds integer word indices of shape (n_samples, maxlen), so
# map each index to a dense vector with an Embedding layer first; the
# GRUs then see (maxlen, 128) per sample.
model.add(Embedding(len(words), 128, input_length=maxlen))
model.add(GRU(512, return_sequences=True))
model.add(Dropout(0.2))
model.add(GRU(512, return_sequences=True))
model.add(Dropout(0.2))
# One softmax over the whole word vocabulary at every timestep.
model.add(TimeDistributedDense(len(words)))
model.add(Activation('softmax'))
# y stores integer class indices, not one-hot vectors, so the sparse
# variant of the loss is required; it expects a trailing singleton class
# axis, hence the expand_dims on y.
# NOTE(review): assumes a Keras version providing
# sparse_categorical_crossentropy — confirm against the installed release.
model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop')
model.fit(X, np.expand_dims(y, -1), batch_size=128, nb_epoch=2)

0 个答案:

没有答案