我正在参照 this blog 构建一个基于LSTM的拼写校正器。训练过程中效果很好,请参阅所附的屏幕截图。
我做了一些细微的修改:输入是拼写错误的单词(大部分是专有名词,例如人名),并让模型尝试为其给出更正建议。我仍然按正确的方式进行了文本向量化,但LSTM的输出仍然是乱码。这是训练11个epoch(轮)之后的model files。
这是我的Python模块:
import numpy as np
from numpy import zeros as np_zeros
from keras.models import model_from_json
# Seed NumPy's global random number generator with a fixed value so that
# NumPy-based randomness is repeatable from run to run.
seed = 7
np.random.seed(seed)
class CharacterTable(object):
    """
    Bidirectional mapping between characters and one-hot / index encodings.

    Given a set of characters:
    + Encode them to a one hot integer representation
    + Decode the one hot integer representation to their character output
    + Decode a vector of probabilities to their character output
    """

    def __init__(self, chars):
        """Build the vocabulary from *chars*.

        Duplicates are removed and the remaining characters are sorted, so
        the index assigned to each character depends only on the set of
        characters, not on the order in which they were supplied.
        """
        self.chars = sorted(set(chars))
        self.char_indices = {c: i for i, c in enumerate(self.chars)}
        self.indices_char = {i: c for i, c in enumerate(self.chars)}
        self.size = len(self.chars)

    def encode(self, C, maxlen):
        """Encode the string *C* as a one-hot array.

        Returns a boolean array of shape ``(maxlen, len(self.chars))`` in
        which row ``i`` has a single True at the index of ``C[i]``. Rows past
        ``len(C)`` are left all-False.

        Raises KeyError if *C* contains a character outside the vocabulary
        and IndexError if *C* is longer than *maxlen*.
        """
        # Use the built-in ``bool``: the ``np.bool`` alias was deprecated in
        # NumPy 1.20 and removed in NumPy 1.24.
        X = np_zeros((maxlen, len(self.chars)), dtype=bool)
        for i, c in enumerate(C):
            X[i, self.char_indices[c]] = 1
        return X

    def decode(self, X, calc_argmax=True):
        """Decode *X* back into a string.

        With ``calc_argmax=True`` (the default) *X* is an array whose last
        axis holds one-hot values or per-character scores; the argmax over
        that axis selects the character index for each position. With
        ``calc_argmax=False`` *X* is already an iterable of character indices.
        """
        if calc_argmax:
            X = X.argmax(axis=-1)
        return ''.join(self.indices_char[x] for x in X)
def load_saved_model(checkpoint_filename=None, dataset_params_filename=None,
                     model_json_filename='/deepspell/models_gpu_1/model_gpu_1.json'):
    """Rebuild a Keras model from its JSON description and optional weights.

    Args:
        checkpoint_filename: Path to a weights file passed to
            ``model.load_weights``. When None, the model is returned with
            whatever weights ``model_from_json`` gives it.
        dataset_params_filename: Accepted for interface compatibility; it is
            not used by this function.
        model_json_filename: Path to the JSON file holding the serialized
            model architecture. Defaults to the path that was previously
            hard-coded here.

    Returns:
        The model object produced by ``model_from_json``.
    """
    # The context manager closes the file even if reading raises.
    with open(model_json_filename, 'r') as json_file:
        loaded_model_json = json_file.read()
    model = model_from_json(loaded_model_json)
    print("Loaded Model")
    if checkpoint_filename is not None:
        model.load_weights(checkpoint_filename)
        print("Loaded Model Weights")
    return model
def vectorize(questions):
    """Vectorize a list of question strings into a one-hot tensor.

    Returns a boolean array of shape
    ``(len(questions), x_max_length, character_table.size)`` laid out as
    (samples, time steps, features). Positions past the end of a string are
    left all-False.

    Raises KeyError if a string contains a character that is not in
    ``character_table`` and IndexError if a string is longer than
    ``x_max_length``.
    """
    # NOTE(review): if training applied extra preprocessing to the inputs
    # (for example padding to a fixed length or reversing the sequence), the
    # same steps must be applied here -- confirm against the training code.
    # Built-in ``bool`` replaces the ``np.bool`` alias, which was deprecated in
    # NumPy 1.20 and removed in NumPy 1.24.
    X = np_zeros((len(questions), x_max_length, character_table.size), dtype=bool)
    for i, sentence in enumerate(questions):
        for j, c in enumerate(sentence):
            X[i, j, character_table.char_indices[c]] = 1
    return X
def run_autocorrect_name(text):
    """Run the loaded model on *text* and return its decoded suggestion.

    Args:
        text: The text to auto-correct.

    Returns:
        The suggestion string decoded from the model's per-position
        predicted character indices. The suggestion is also printed.
    """
    text_vect = vectorize([text])
    # ``Sequential.predict_classes`` has been removed from newer Keras
    # releases; taking the argmax of ``predict`` over the last axis yields the
    # same class indices for a multi-class output.
    pred_text = model.predict(text_vect, verbose=0).argmax(axis=-1)
    # pred_text[0] already holds character indices, so no argmax on decode.
    suggestion = character_table.decode(pred_text[0], calc_argmax=False)
    print("Suggestion for {} : {} ".format(text, suggestion))
    return suggestion
# Alphabet constant; the character table below is built from the two sets
# that follow, not from this list.
CHARS = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .")

# Maximum number of time steps in a vectorized input.
x_max_length = 40

# chars_answer : lowercase a-z, dot, dash, space
chars_answer = {'k', '.', 'x', 'l', 'n', 't', 'p', 'm', 'b', 'r', 'e', 'i', 'g', 'y', 'z', 'u', 'f', 'q', 'o', 's', 'd', 'j', ' ', 'v', 'h', 'a', '-', 'c', 'w'}
# chars_question : lowercase + uppercase letters, dot, dash, space
chars_question = {'I', 'k', '.', 'x', 'l', 'n', 'Q', 't', 'm', 'p', 'b', 'r', 'e', 'i', 'K', 'S', 'g', 'H', 'M', 'y', 'z', 'u', 'W', 'R', 'L', 'U', 'f', 'q', 'X', 'G', 'F', 'Y', 'A', 'T', 'o', 'J', 's', 'Z', 'd', 'j', 'D', 'O', ' ', 'v', 'V', 'C', 'h', 'E', 'N', 'B', 'a', '-', 'P', 'c', 'w'}

# Vocabulary is the sorted union of both character sets.
# NOTE(review): the resulting index order must match the table used at
# training time, otherwise decoded output will not line up -- confirm.
chars = sorted(chars_answer | chars_question)
character_table = CharacterTable(chars)

# Rebuild the model and load the weights saved after epoch 11.
model = load_saved_model(
    checkpoint_filename='/deepspell/models_gpu_1/weights.11-0.11.hdf5'
)

# Compile the loaded model before running inference.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

test_str = 'Taylor Swift'
run_autocorrect_name(test_str)
上面代码的输出是:
Suggestion for Taylor Swift : anezalrr gilluuhhooooosssaaa...........