执行拼写检查程序时输出错误

时间:2018-12-06 10:49:24

标签: python keras deep-learning lstm recurrent-neural-network

我正在使用下面的代码进行拼写纠正。Python 版本是 3.6.5,代码在 Jupyter Notebook 中执行。

代码:

import os

import errno

from collections import Counter

from hashlib import sha256

import re

import json

import itertools

import logging

import requests

import numpy as np

import pandas as pd

from numpy.random import choice as random_choice, randint as random_randint, 
shuffle as random_shuffle, seed as random_seed, rand

from numpy import zeros as np_zeros

from keras.models import Sequential, load_model

from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, 
Dropout, recurrent

from keras.callbacks import Callback

# Module-level logger: accept every record from DEBUG upwards and send it
# through a stream handler.
LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.DEBUG)
LOGGER.addHandler(logging.StreamHandler())

# Seed NumPy's global random generator with a fixed value.
random_seed(123)

class Configuration(object):
    """Plain attribute container for the model and training settings."""


# The shared settings object has to be created AFTER the class statement has
# finished: inside the class body the name ``Configuration`` is not bound yet,
# so instantiating it there raises NameError.
CONFIG = Configuration()

# Number of LSTM layers added before / after the RepeatVector in generate_model.
CONFIG.input_layers = 2
CONFIG.output_layers = 2

# Rate passed to every Dropout layer.
CONFIG.amount_of_dropout = 0.2

# Size argument passed to every LSTM layer.
CONFIG.hidden_size = 500

# kernel_initializer used for the LSTM and Dense layers.
CONFIG.initialization = "he_normal"

# Not read by the code visible in this file section.
CONFIG.number_of_chars = 26

# Second dimension of the arrays built by _vectorize; also returned by vectorize.
CONFIG.max_input_len = 20

# Not read by the code visible in this file section.
CONFIG.inverted = True

# batch_size / epochs arguments of model.fit in iterate_training.
CONFIG.batch_size = 4
CONFIG.epochs = 50

# Not read by the code visible in this file section.
CONFIG.steps_per_epoch = 10
CONFIG.validation_steps = 10

# Upper bound of the training loop in iterate_training.
CONFIG.number_of_iterations = 10

# Load the word pairs; the 'input' and 'output' columns are read below.
dataset = pd.read_csv("inpspell_wordpair2.csv")

input_data = dataset['input'].tolist()
# NOTE: str() of a list is its repr, so besides the words this string also
# contains the brackets, quotes, commas and spaces of the list syntax.
input_data1 = str(input_data)

output_data = dataset['output'].tolist()
output_data1 = str(output_data)

# Distinct characters of each repr string.  A single set() call gives exactly
# the same result as the former union of one identical set per character, but
# in linear instead of quadratic time.
chars_input = set(input_data1)
chars_output = set(output_data1)

chars = list(set.union(chars_input, chars_output))

# The data-derived alphabet above is immediately replaced by this fixed one
# (space plus lower-case ASCII letters); this is the alphabet the script uses.
chars = list(" abcdefghijklmnopqrstuvwxyz")

MIN_INPUT_LEN = 1

AMOUNT_OF_NOISE = 0.2 / CONFIG.max_input_len

# Default alphabet used by vectorize / generate_model when no chars are passed.
CHARS = list(" abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .")

class CharacterTable(object):
    """Bidirectional character <-> index table with one-hot encode/decode.

    The table is built from the distinct characters of ``chars`` in sorted
    order, so the index of a character is its rank in that ordering.
    """

    def __init__(self, chars):
        """Build the lookup tables from the distinct characters in *chars*."""
        self.chars = sorted(set(chars))
        self.char_indices = {c: i for i, c in enumerate(self.chars)}
        self.indices_char = {i: c for i, c in enumerate(self.chars)}

    @property
    def size(self):
        """Number of distinct characters in the table."""
        return len(self.chars)

    def encode(self, C, maxlen):
        """Return a ``(maxlen, size)`` boolean one-hot matrix for string *C*.

        Rows past ``len(C)`` stay all-False.

        Raises:
            KeyError: if *C* contains a character that is not in the table.
            IndexError: if *C* has more than *maxlen* characters.
        """
        # The builtin ``bool`` is used as dtype: the ``np.bool`` alias was
        # deprecated and later removed from NumPy.
        X = np.zeros((maxlen, len(self.chars)), dtype=bool)
        for i, c in enumerate(C):
            X[i, self.char_indices[c]] = 1
        return X

    def decode(self, X, calc_argmax=True):
        """Turn one-hot rows (or, with ``calc_argmax=False``, indices) into a string.

        With ``calc_argmax=True`` each row of *X* is reduced to the index of
        its largest entry first; otherwise *X* must already be an iterable of
        character indices.
        """
        if calc_argmax:
            X = X.argmax(axis=-1)
        return ''.join(self.indices_char[x] for x in X)

def _vectorize(questions, answers, ctable):
    """One-hot encode the question and answer sentences into integer arrays.

    Args:
        questions: sequence of input sentences (strings).
        answers: sequence of target sentences (strings).
        ctable: character table exposing ``size`` and ``char_indices``.

    Returns:
        ``(X, y)``: two arrays of shape
        ``(len(questions), CONFIG.max_input_len, ctable.size)``.  Row ``i``
        holds the ``i``-th sentence counted from the END of the input
        sequence, the same order the earlier ``pop()``-based loop produced,
        so rows of ``X`` and ``y`` still pair up.

    Notes:
        * The input sequences are no longer emptied; they are only read.
        * Characters missing from ``ctable`` leave their position all-zero.
        * Sentences longer than ``CONFIG.max_input_len`` are truncated to that
          length instead of failing with IndexError.
        * NOTE(review): positions after the end of a sentence stay all-zero,
          i.e. the targets carry no explicit padding symbol.  This may be why
          predictions repeat the last character - TODO confirm, and consider
          padding the sentences with a dedicated character before encoding.
    """
    len_of_questions = len(questions)
    max_len = CONFIG.max_input_len

    X = np_zeros((len_of_questions, max_len, ctable.size), dtype=int)

    print("inputchars")

    # reversed() walks from the last sentence to the first without mutating
    # the caller's list.
    for i, sentence in enumerate(reversed(questions)):
        print(i)
        print(sentence)
        for j, c in enumerate(sentence[:max_len]):
            print(j, c)
            try:
                index = ctable.char_indices[c]
            except KeyError:
                # Character outside the table: keep this position all-zero.
                continue
            X[i, j, index] = 1

    y = np_zeros((len_of_questions, max_len, ctable.size), dtype=int)
    print("outputchars")
    for i, sentence in enumerate(reversed(answers)):
        print(i)
        print(sentence)
        for j, c in enumerate(sentence[:max_len]):
            try:
                index = ctable.char_indices[c]
            except KeyError:
                continue
            y[i, j, index] = 1
    return X, y

def vectorize(questions, answers, chars=None):
    """Build a character table and one-hot encode the sentence pairs.

    Args:
        questions: input sentences, forwarded to ``_vectorize``.
        answers: target sentences, forwarded to ``_vectorize``.
        chars: alphabet for the character table; the module-level ``CHARS``
            is used when this is ``None`` or empty.

    Returns:
        ``(X, y, CONFIG.max_input_len, ctable)``.
    """
    print('Vectorization...')
    if not chars:
        chars = CHARS
    ctable = CharacterTable(chars)

    print("inputdata before _vec")
    print(questions)
    encoded = _vectorize(questions, answers, ctable)
    X, y = encoded

    # Report the shape of both arrays, inputs first.
    for array in (X, y):
        print(array.shape)

    return X, y, CONFIG.max_input_len, ctable

def generate_model(output_len, chars=None):
    """Assemble and compile the stacked-LSTM Sequential model.

    Args:
        output_len: repeat count handed to the ``RepeatVector`` layer.
        chars: alphabet; its length sets the input feature size and the width
            of the final Dense layer.  ``CHARS`` is used when this is ``None``
            or empty.

    Returns:
        The compiled model (categorical cross-entropy loss, adam optimizer,
        accuracy metric).
    """
    print('Build model...')
    if not chars:
        chars = CHARS
    vocabulary_size = len(chars)
    model = Sequential()

    # Input-side LSTM stack: every layer except the last returns sequences.
    last_input_layer = CONFIG.input_layers - 1
    for layer_number in range(CONFIG.input_layers):
        input_lstm = recurrent.LSTM(
            CONFIG.hidden_size,
            input_shape=(None, vocabulary_size),
            kernel_initializer=CONFIG.initialization,
            return_sequences=layer_number < last_input_layer,
        )
        model.add(input_lstm)
        model.add(Dropout(CONFIG.amount_of_dropout))

    # Repeat the output of the input stack once per output position.
    model.add(RepeatVector(output_len))

    # Output-side LSTM stack: every layer returns sequences.
    for _ in range(CONFIG.output_layers):
        output_lstm = recurrent.LSTM(
            CONFIG.hidden_size,
            return_sequences=True,
            kernel_initializer=CONFIG.initialization,
        )
        model.add(output_lstm)
        model.add(Dropout(CONFIG.amount_of_dropout))

    # Per-position projection onto the alphabet followed by softmax.
    projection = Dense(vocabulary_size, kernel_initializer=CONFIG.initialization)
    model.add(TimeDistributed(projection))
    model.add(Activation('softmax'))

    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

def iterate_training(model, X_train, y_train, X_val, y_val, ctable):
    """Call ``model.fit`` repeatedly on the training data.

    Each round fits with ``CONFIG.batch_size`` and ``CONFIG.epochs`` and
    validates against ``(X_val, y_val)``.  ``ctable`` is accepted but not used
    by this function.

    NOTE(review): the loop performs ``CONFIG.number_of_iterations - 1`` rounds
    (the historical ``range(1, n)``), not ``number_of_iterations`` - confirm
    whether that is intended.
    """
    validation_pair = (X_val, y_val)
    rounds = CONFIG.number_of_iterations - 1
    for _round in range(rounds):
        model.fit(
            X_train,
            y_train,
            batch_size=CONFIG.batch_size,
            epochs=CONFIG.epochs,
            validation_data=validation_pair,
        )

# Encode the word pairs with the fixed lower-case alphabet.
X_train, y_train, y_maxlen, ctable = vectorize(input_data, output_data, chars)

print("y_maxlen, chars", y_maxlen, "".join(chars))

# Build the network and train it; the training arrays double as validation data.
model = generate_model(y_maxlen, chars)
iterate_training(model, X_train, y_train, X_val=X_train, y_val=y_train, ctable=ctable)

# 1) Show the encoded inputs decoded back to text.
for inp in X_train:
    inputarray = ctable.decode(inp)
    print(inputarray)

# 2) Show the model's predictions; predict_classes already yields class
#    indices, so decode is called without the argmax step.
prediction = model.predict_classes(X_train, verbose=0)
for p in prediction:
    guess = ctable.decode(p, calc_argmax=False)
    print(guess)

# 3) Show the expected outputs decoded back to text.
for op in y_train:
    correct = ctable.decode(op)
    print(correct)

对于上面的代码,我使用了 50 个单词样本作为输入。执行后虽然能得到正确的单词,但单词后面会被其最后一个字符重复填充,直到达到最大输入长度。输出如下所示:

blangggggggggggggggg

accumulatorrrrrrrrrr

plateeeeeeeeeeeeeeeeee

pipeeeeeeeeeeeeeeeeeee

universallllllllllll

abrasiveeeeeeeelllll

wheellllllllllllllll

kittsiveeellllllllll

solidddddddddddddddddd

输入的单词是:

布朗
蓄电池

管道
通用
磨料
车轮
套件
稳定

我该如何解决这个问题?

0 个答案:

没有答案