我正在与keras建立聊天机器人。我看到了有关如何构建带有文本模型的上下文聊天机器人的指南。 该模型在一组唯一的单词上进行训练,并学会识别链接到某些单词的类别。 文档列表是带有单词及其相应类别或“标签”的元组列表。
建立模型后,我有一个小脚本,我在其中接收用户输入(句子)并对句子进行分类。 分类功能接收用户的输入(句子)和标记化单词列表,该单词列表与模型在其上训练的单词列表相同。
我的问题是:在加载模型之前,是否需要向模型提供其训练数据?如果是这样,我该如何克服这个问题,在不再次加载全部训练数据的情况下加载模型并用它进行分类?
def classify(sentence, words):
    """Run *sentence* through the loaded model and return its class scores.

    Parameters
    ----------
    sentence : str
        Raw user input.
    words : list[str]
        The vocabulary the model was trained on (same order as training).

    Returns
    -------
    The model's output vector for this sentence (one score per class).
    """
    # Bag-of-words vector for the input sentence.
    p = bow(sentence, words)
    # Keras accepts any batch size, so predict on a single-row batch
    # instead of padding with len(documents)-2 zero rows.  This removes
    # the dependency on the training-time `documents` list entirely:
    # only the vocabulary is needed at inference time.
    results = model.predict(np.array([p]))[0]
    return results
def bow(sentence, words):
    """Return a binary bag-of-words vector for *sentence* over *words*.

    Parameters
    ----------
    sentence : str
        Raw user input; tokenized and stemmed via clean_up_sentence().
    words : list[str]
        Vocabulary in training order; position i of the result is 1 iff
        words[i] occurs (stemmed) in the sentence.

    Returns
    -------
    numpy.ndarray of 0/1 with length len(words).
    """
    # Tokenize and stem the pattern the same way the training data was.
    sentence_words = clean_up_sentence(sentence)
    # Build an O(1) lookup once instead of scanning the whole vocabulary
    # for every token (the original nested loop was O(tokens * vocab)).
    index = {w: i for i, w in enumerate(words)}
    bag = [0] * len(words)
    for token in sentence_words:
        i = index.get(token)
        if i is not None:
            bag[i] = 1
    return np.array(bag)
def clean_up_sentence(sentence):
    """Tokenize *sentence* and reduce every token to its lower-cased stem."""
    tokens = nltk.word_tokenize(sentence)
    return [stemmer.stem(token.lower()) for token in tokens]
# Rebuild the vocabulary exactly as in the training script.  Only the
# `words` list (and `classes`, to map predictions back to tags) must be
# reconstructed at inference time -- the model itself is fully restored
# from the .h5 file and does NOT need its training data loaded again.
with open('teste.csv', 'r', encoding='utf-8-sig', newline='') as csv_file:
    csvReader = csv.DictReader(csv_file, delimiter = ';', quoting=csv.QUOTE_MINIMAL)
    data = list(csvReader)

intents = {'intents': data}

words = []
classes = []
documents = []
ignore_words = ['?']
for intent in intents['intents']:
    pattern = intent['patterns']
    w = nltk.word_tokenize(pattern)
    words.extend(w)
    documents.append((w, intent['tag']))
    if intent['tag'] not in classes:
        classes.append(intent['tag'])

# Stem, drop ignored tokens, and remove duplicates from the lists --
# this must match the training script byte-for-byte so the bag-of-words
# positions line up with the model's input layer.
words = sorted(set(stemmer.stem(w.lower()) for w in words if w not in ignore_words))
classes = sorted(set(classes))

model = keras.models.load_model("model_test.h5")

sentence = input()
# BUG FIX: classify() takes two arguments -- the vocabulary was missing,
# which raised a TypeError; the result was also silently discarded.
print(classify(sentence, words))