The code above is more or less the language model that the Keras documentation provides. The problem is that this language model predicts characters, not words — strictly speaking, a language model should predict complete words.
My question is: how do I change this so that it predicts whole words?
from __future__ import print_function

import random
import sys

import numpy as np
from keras.layers.core import Activation, Dense, Dropout, TimeDistributedDense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU, LSTM
from keras.models import Sequential
from keras.utils.data_utils import get_file
# Character-level preprocessing: read the corpus, build char<->index maps,
# and vectorize overlapping windows of maxlen characters into one-hot arrays.
path = "C:/Users/Cedric Oeldorf/Desktop/University/Research/Data/Gutenberg/MYDATAFINAL3.txt"
# Context manager guarantees the file handle is closed even on error.
with open(path) as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))

# sorted() gives a deterministic char<->index mapping across runs; a bare
# set() would reorder the vocabulary every run and invalidate saved models.
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
# np.bool was deprecated and then removed from NumPy; the builtin bool
# produces the same boolean arrays in every NumPy version.
# X: one-hot (samples, maxlen, vocab); y: one-hot next char (samples, vocab).
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
from keras.callbacks import History

# Records per-epoch training metrics.
histLSTM = History()

# build the model: 2 stacked LSTM
print('Build model...')
model = Sequential([
    LSTM(512, return_sequences=True, input_shape=(maxlen, len(chars))),
    Dropout(0.2),
    LSTM(512, return_sequences=False),
    Dropout(0.2),
    Dense(len(chars)),
    Activation('softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.fit(X, y, batch_size=128, nb_epoch=4, callbacks=[histLSTM])
So far, my idea for preprocessing the data is this:
# Word-level preprocessing: tokenize the corpus, build word<->index maps,
# and cut the token stream into windows of maxlen words plus the next word.
path = "C:/MYDATAFINAL3.txt"
with open(path) as corpus_file:  # closed automatically, even on error
    text = corpus_file.read().lower()
print('corpus length:', len(text))

# tokenize corpus and get list of unique words
# NOTE(review): gensim is used here but never imported in this file — add
# `import gensim` at the top.
tok = gensim.utils.simple_preprocess(text, deacc=False)
# sorted() gives a deterministic word<->index mapping across runs.
words = sorted(set(tok))
word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))

sentences1 = text.split('.')
SYMBOLS = '{}()[].,:;+-*/&|<>=~$'
# str.translate(None, SYMBOLS) is Python-2-only; on Python 3 a translation
# table built with str.maketrans deletes the same characters.
m = [item.translate(str.maketrans('', '', SYMBOLS)).strip() for item in sentences1]
del text

maxlen = 60
step = 3
sentences = []
next_words = []
for i in range(0, len(tok) - maxlen, step):
    sentences.append(tok[i: i + maxlen])
    next_words.append(tok[i + maxlen])
print('nb sequences:', len(sentences))

# Integer-encoded (not one-hot) arrays: X[i] is a window of maxlen word ids;
# y[i] will hold the per-position target word ids.
X = np.zeros((len(sentences), maxlen), dtype="int32")
y = np.zeros((len(sentences), maxlen), dtype="int32")
This step did not work:
#In X, change boolean to true for every listed character, same for y
for i, sentence in enumerate(sentences):
for t, words in enumerate(sentence):
X[i, t,] = word_indices[words]
y[i, t] = word_indices[words]
Also, I don't know what input shape I should be using:
# Word-level model: integer word ids -> Embedding -> stacked GRUs -> softmax
# over the vocabulary at every timestep.
print('Build model...')
model = Sequential()
# input_shape must describe ONE sample, never the dataset size; the original
# (len(sentences), maxlen) baked the number of samples into the shape.
# Integer-encoded X of shape (samples, maxlen) is consumed by an Embedding
# layer, which maps each word id to a dense 512-dim vector.
model.add(Embedding(len(words), 512, input_length=maxlen))
model.add(GRU(512, return_sequences=True))
model.add(Dropout(0.2))
model.add(GRU(512, return_sequences=True))
model.add(Dropout(0.2))
# One softmax over the whole vocabulary at every timestep.
model.add(TimeDistributedDense(len(words)))
model.add(Activation('softmax'))
# sparse_categorical_crossentropy takes integer class targets directly, so
# y never has to be one-hot encoded ((samples, maxlen, vocab) would not fit
# in memory for a real vocabulary).
model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop')
# Some Keras versions expect sparse targets with a trailing singleton axis:
# (samples, maxlen, 1).
model.fit(X, np.expand_dims(y, -1), batch_size=128, nb_epoch=2)