我正在使用此代码进行拼写更正。 python版本是3.6.5。我正在jupyter笔记本中执行此代码。
代码:
import os
import errno
from collections import Counter
from hashlib import sha256
import re
import json
import itertools
import logging
import requests
import numpy as np
import pandas as pd
from numpy.random import choice as random_choice, randint as random_randint,
shuffle as random_shuffle, seed as random_seed, rand
from numpy import zeros as np_zeros
from keras.models import Sequential, load_model
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector,
Dropout, recurrent
from keras.callbacks import Callback
LOGGER = logging.getLogger(__name__)
LOGGER.addHandler(logging.StreamHandler())
LOGGER.setLevel(logging.DEBUG)
random_seed(123)
class Configuration(object):
CONFIG = Configuration()
CONFIG.input_layers = 2
CONFIG.output_layers = 2
CONFIG.amount_of_dropout = 0.2
CONFIG.hidden_size = 500
CONFIG.initialization = "he_normal"
CONFIG.number_of_chars = 26
CONFIG.max_input_len = 20
CONFIG.inverted = True
CONFIG.batch_size = 4
CONFIG.epochs = 50
CONFIG.steps_per_epoch = 10
CONFIG.validation_steps = 10
CONFIG.number_of_iterations = 10
dataset=pd.read_csv("inpspell_wordpair2.csv")
input_data=dataset['input'].tolist()
input_data1=str(input_data)
output_data=dataset['output'].tolist()
output_data1=str(output_data)
chars_input = set.union(*(set(input_data1) for inp in input_data1))
chars_output = set.union(*(set(output_data1) for op in output_data1))
chars = list(set.union(chars_input, chars_output))
chars=list(" abcdefghijklmnopqrstuvwxyz")
MIN_INPUT_LEN = 1
AMOUNT_OF_NOISE = 0.2 / CONFIG.max_input_len
CHARS = list(" abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .")
class CharacterTable(object):
def __init__(self, chars):
self.chars = sorted(set(chars))
self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
self.indices_char = dict((i, c) for i, c in enumerate(self.chars))
@property
def size(self):
return len(self.chars)
def encode(self, C, maxlen):
X = np.zeros((maxlen, len(self.chars)), dtype=np.bool) #
for i, c in enumerate(C):
X[i, self.char_indices[c]] = 1
return X
def decode(self, X, calc_argmax=True):
if calc_argmax:
X = X.argmax(axis=-1)
return ''.join(self.indices_char[x] for x in X)
def _vectorize(questions, answers, ctable):
len_of_questions = len(questions)
X = np_zeros((len_of_questions, CONFIG.max_input_len, ctable.size),dtype=int)
print("inputchars")
for i in range(len(questions)):
print(i)
sentence = questions.pop()
print(sentence)
for j, c in enumerate(sentence):
print(j,c)
try:
X[i, j, ctable.char_indices[c]] = 1
except KeyError:
pass
y = np_zeros((len_of_questions, CONFIG.max_input_len, ctable.size),
dtype=int)
print("outputchars")
for i in range(len(answers)):
print(i)
sentence = answers.pop()
print(sentence)
for j, c in enumerate(sentence):
try:
y[i, j, ctable.char_indices[c]] = 1
except KeyError:
pass
return X, y
def vectorize(questions, answers, chars=None):
print('Vectorization...')
chars = chars or CHARS
ctable = CharacterTable(chars)
print("inputdata before _vec")
print(questions)
X, y = _vectorize(questions, answers, ctable)
print(X.shape)
print(y.shape)
return X, y, CONFIG.max_input_len, ctable
def generate_model(output_len, chars=None):
print('Build model...')
chars = chars or CHARS
model = Sequential()
for layer_number in range(CONFIG.input_layers):
model.add(recurrent.LSTM(CONFIG.hidden_size, input_shape=(None, len(chars)), kernel_initializer=CONFIG.initialization, return_sequences=layer_number + 1 < CONFIG.input_layers))
model.add(Dropout(CONFIG.amount_of_dropout))
model.add(RepeatVector(output_len))
for _ in range(CONFIG.output_layers):
model.add(recurrent.LSTM(CONFIG.hidden_size, return_sequences=True,
kernel_initializer=CONFIG.initialization))
model.add(Dropout(CONFIG.amount_of_dropout))
model.add(TimeDistributed(Dense(len(chars), kernel_initializer=CONFIG.initialization)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def iterate_training(model, X_train, y_train, X_val, y_val, ctable):
for iteration in range(1, CONFIG.number_of_iterations):
model.fit(X_train, y_train, batch_size=CONFIG.batch_size,
epochs=CONFIG.epochs, validation_data=(X_val, y_val))
X_train,y_train, y_maxlen, ctable = vectorize(input_data, output_data,
chars)
print ("y_maxlen, chars", y_maxlen, "".join(chars))
model = generate_model(y_maxlen, chars)
iterate_training(model, X_train, y_train, X_train, y_train, ctable)
for inp in X_train:
inputarray = ctable.decode(inp)
print(inputarray)
prediction=model.predict_classes(X_train, verbose=0)
for p in prediction:
guess = ctable.decode(p, calc_argmax=False)
print(guess)
for op in y_train:
correct = ctable.decode(op)
print(correct)
对于上面的代码,我将50个单词样本作为输入。但是执行后,我得到正确的单词,并用单词的最后一个字符填充直到最大输入长度。输出如下所示:
blangggggggggggggggg
accumulatorrrrrrrrrr
plateeeeeeeeeeeeeeeeee
pipeeeeeeeeeeeeeeeeeee
universallllllllllll
abrasiveeeeeeeelllll
wheellllllllllllllll
kittsiveeellllllllll
solidddddddddddddddddd
输入的单词是:
布朗
蓄电池
板
管道
通用
磨料
车轮
套件
稳定
我该如何克服这个问题。