I am learning TensorFlow and bidirectional LSTM networks. I trained a model (code below) to learn the next character in a text.
I generate a dummy sequence made of the characters a, b and c, with different transition probabilities.
Training the LSTM seems to work fine and converges very quickly (is that actually normal, or should it be a red flag? I assume it is normal, since my text is very short and its complexity very low).
My problem is when I reuse the trained network: I close the training session, reset the graph, and load the model with TensorFlow's Saver.
When I feed the same data I used for training back into that same model (but with a batch size of 1, since I am predicting the next character from the current one), the accuracy I get over the whole text is very low compared to the training accuracy.
I know I am doing something wrong but cannot pinpoint it. Can anyone give me some pointers on how to improve my model?
Code (using TensorFlow 1.2):
from __future__ import division, print_function
import tensorflow as tf
import numpy as np
import os, sys, argparse
def dummy_sequence(dummy_length):
    """ generate a round of dummy sequences
    """
    seq = ""
    p_abc_ori = np.asarray([0.7, 0.2, 0.1])
    p_abc_trans = np.asarray(
        [[0.6, 0.3, 0.1],
         [0.3, 0.5, 0.2],
         [0.8, 0.1, 0.1]])
    chars = ["a", "b", "c"]
    positions = {"a": 0, "b": 1, "c": 2}
    c = np.random.choice(chars, p=p_abc_ori)
    seq += c
    for i in range(dummy_length):
        c = np.random.choice(chars, p=p_abc_trans[positions[c]])
        seq += c
    return seq
data_seqs = dummy_sequence(40000)
data_length = len(data_seqs)
char_set = set()
for ch in data_seqs:
    char_set.add(ch)
char_list = sorted(list(char_set))
char2idx = dict(zip(char_list, range(len(char_list))))
idx2char = dict(zip(range(len(char_list)), char_list))
def sample_generator(data_seqs, char_dict, batch_size, sequence_length):
    data_length = len(data_seqs)
    length = sequence_length + 1
    num_steps = (data_length // batch_size)
    for step in range(num_steps):
        start_idxs = np.random.random_integers(0, data_length, batch_size)
        input_batch = np.zeros((batch_size, sequence_length), dtype=np.int32)
        target_batch = np.zeros((batch_size, sequence_length), dtype=np.int32)
        for i, start_idx in enumerate(start_idxs):
            sample = [char_dict[data_seqs[i % data_length]] for i in range(start_idx, start_idx + length)]
            input_batch[i, :] = sample[0:sequence_length]
            target_batch[i, :] = sample[1:sequence_length + 1]
        start_idxs = (start_idxs + sequence_length) % data_length
        yield input_batch, target_batch
# parameters
sequence_length = 150
batch_size = 200
number_of_characters = len(char_set)
hidden_size = 512
dropout = 0.8
learning_rate = 2e-3
class Model:
    def __init__(self, batch_size, sequence_length, hidden_size,
                 number_of_characters, learning_rate, dropout,
                 is_training=False):
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate
        self.dropout = dropout
        self.number_of_characters = number_of_characters
        # placeholders for X and Y
        self._inputs = tf.placeholder(tf.int32, [self.batch_size, self.sequence_length], name="input")
        self._targets = tf.placeholder(tf.int32, [self.batch_size, self.sequence_length], name="target")
        one_hot_inputs = tf.one_hot(self._inputs, depth=self.number_of_characters)
        # Bi-LSTM
        cell_fw = tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.LSTMCell(self.hidden_size, state_is_tuple=True),
            output_keep_prob=self.dropout)
        cell_bw = tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.LSTMCell(self.hidden_size, state_is_tuple=True),
            output_keep_prob=self.dropout)
        self._initial_state_fw = cell_fw.zero_state(self.batch_size, tf.float32)
        self._initial_state_bw = cell_bw.zero_state(self.batch_size, tf.float32)
        lstm_output, final_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw,
                                                                   one_hot_inputs,
                                                                   initial_state_fw=self.initial_state_fw,
                                                                   initial_state_bw=self.initial_state_bw)
        lstm_output_fw, lstm_output_bw = lstm_output
        final_state_fw, final_state_bw = final_state
        # concatenate fw and bw layers
        lstm_output = tf.concat(lstm_output, axis=2)
        # apply dense to reshape
        lstm_dense = tf.layers.dense(inputs=lstm_output, units=self.hidden_size, activation=tf.nn.tanh)
        # concatenate with input
        lstm_output = tf.concat((lstm_dense, one_hot_inputs), axis=2)
        # apply dense to reshape
        lstm_output = tf.layers.dense(lstm_output, units=self.number_of_characters, activation=tf.nn.softmax)
        # compute logits and probabilities
        self._logits_flat = tf.reshape(lstm_output, (-1, self.number_of_characters))
        probabilities_flat = tf.nn.softmax(self.logits_flat)
        self._probabilities = tf.reshape(probabilities_flat, (self.batch_size, -1, self.number_of_characters))
        targets_flat = tf.reshape(self.targets, (-1, ))
        correct_pred = tf.equal(tf.argmax(probabilities_flat, 1), tf.cast(tf.round(targets_flat), tf.int64))
        # compute accuracy
        self._accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        if not is_training:
            return
        # compute loss
        self._loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits_flat, labels=targets_flat)
        self._cost = tf.reduce_mean(self.loss)
        # optimizer
        trainable_variables = tf.trainable_variables()
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        gradients = tf.gradients(self.loss, trainable_variables)
        gradients, _ = tf.clip_by_global_norm(gradients, 5)
        self._train_op = optimizer.apply_gradients(zip(gradients, trainable_variables))

    @property
    def inputs(self):
        return self._inputs

    @property
    def targets(self):
        return self._targets

    @property
    def initial_state_fw(self):
        return self._initial_state_fw

    @property
    def initial_state_bw(self):
        return self._initial_state_bw

    @property
    def logits_flat(self):
        return self._logits_flat

    @property
    def probabilities(self):
        return self._probabilities

    @property
    def accuracy(self):
        return self._accuracy

    @property
    def loss(self):
        return self._loss

    @property
    def cost(self):
        return self._cost

    @property
    def train_op(self):
        return self._train_op
########################
# Train
outputdir = "./tmp"
if not os.path.isdir(outputdir):
    os.makedirs(outputdir)
model = Model(batch_size, sequence_length, hidden_size, number_of_characters, learning_rate, dropout, True)
save_path = os.path.join(outputdir, 'model')
saver = tf.train.Saver(tf.trainable_variables())
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init_op)
    state_fw = sess.run(model.initial_state_fw)
    state_bw = sess.run(model.initial_state_bw)
    for epoch in range(5):
        all_acc = list()
        all_loss = list()
        for input_batch, target_batch in sample_generator(data_seqs, char2idx, batch_size, sequence_length):
            feed_dict = {model.inputs: input_batch,
                         model.targets: target_batch}
            computed_cost, computed_accuracy, _ = \
                sess.run([model.cost,
                          model.accuracy,
                          model.train_op],
                         feed_dict=feed_dict)
            all_loss.append(computed_cost)
            all_acc.append(computed_accuracy)
            #print(sum(all_loss), sum(all_acc))
        print('i: {}, loss: {}, accuracy: {}'.format(epoch,
                                                     sum(all_loss)/len(all_loss),
                                                     sum(all_acc)/len(all_acc)))
    # print
    # i: 0, loss: 0.6021944493055343, accuracy: 0.9481146431714297
    # i: 1, loss: 0.554149004817009, accuracy: 0.9972950986027718
    # i: 2, loss: 0.5541560265421868, accuracy: 0.9972904279828072
    # i: 3, loss: 0.5541570243239403, accuracy: 0.9972894296050072
    # i: 4, loss: 0.5541545218229293, accuracy: 0.9972919332981109
    saver.save(sess, save_path)
########################
# Predict on trained variables
tf.reset_default_graph()
latest_checkpoint = tf.train.latest_checkpoint(outputdir)
model = Model(1, None, hidden_size, number_of_characters, learning_rate, dropout)
saver = tf.train.Saver(tf.trainable_variables())
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init_op)
    saver.restore(sess, latest_checkpoint)
    all_acc = list()
    for idx, character in enumerate(data_seqs[:-1]):
        idx_target = char2idx[data_seqs[idx+1]]
        idx_query = char2idx[character]
        feed_dict = {model.inputs: np.asarray([[idx_query]]),
                     model.targets: np.asarray([[idx_target]])}
        out, acc = sess.run([model.probabilities,
                             model.accuracy],
                            feed_dict=feed_dict)
        all_acc.append(acc)
    print("Global accuracy: {}".format(sum(all_acc)/len(all_acc)))
# print:
# Global accuracy: 0.594775
Answer 0 (score: 0)
It looks like you are calling Model() twice without indicating variable reuse (for example with a tf.variable_scope that has reuse=True). That is one option, but I would suggest structuring the code with object-oriented Layers instead; that way the variables live with the Layer objects, and reuse is easy to reason about. (Note that you need a newer version of TensorFlow to use tf.keras.Model.) A minimal sketch of the variable-scope option follows.
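As a rough sketch only (assuming TensorFlow 1.x; the scope name "char_rnn" and the small build_logits helper are placeholders standing in for the asker's Model class, not code from the question):

import tensorflow as tf

def build_logits(inputs, hidden_size, num_chars):
    # Stand-in for the real model: one LSTM layer plus a projection to characters.
    cell = tf.contrib.rnn.LSTMCell(hidden_size, state_is_tuple=True)
    outputs, _ = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    return tf.layers.dense(outputs, num_chars, name="proj")

train_inputs = tf.placeholder(tf.float32, [200, 150, 3])
infer_inputs = tf.placeholder(tf.float32, [1, 1, 3])

# The first call creates the variables ...
with tf.variable_scope("char_rnn"):
    train_logits = build_logits(train_inputs, hidden_size=512, num_chars=3)

# ... the second call reuses exactly the same variables instead of silently
# building an untrained parallel copy of the network.
with tf.variable_scope("char_rnn", reuse=True):
    infer_logits = build_logits(infer_inputs, hidden_size=512, num_chars=3)

With reuse=True the second build fails loudly if a variable does not already exist, which makes accidental duplication easy to catch.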
还有一个"中途"解决方案as in the official MNIST example,它允许您使用功能层(tf.layers.dense
,tf.layers.Dense
作为面向对象的版本),同时自动管理重用;变量与模板对象一起存在。
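A rough sketch of that halfway style, assuming tf.make_template is the mechanism doing the sharing (char_model below is again only a placeholder model, not the asker's code):

import tensorflow as tf

def char_model(inputs, hidden_size=512, num_chars=3):
    # Functional layers only; the template below owns the variables.
    cell = tf.contrib.rnn.LSTMCell(hidden_size, state_is_tuple=True)
    outputs, _ = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    return tf.layers.dense(outputs, num_chars, name="proj")

# Every call to shared_model builds new ops but reuses one set of variables.
shared_model = tf.make_template("char_rnn", char_model)

train_logits = shared_model(tf.placeholder(tf.float32, [200, 150, 3]))
infer_logits = shared_model(tf.placeholder(tf.float32, [1, 1, 3]))

The advantage over a manual variable_scope is that you never have to remember where to set reuse; the template decides based on whether it has already been called.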