我在python中使用tensorflow-gpu==1.7.0
,在保存和恢复经过训练的神经网络方面遇到了令人尴尬的困难。
基本上,我想在训练网络时自动保存它,并在将其用于推理(检测)时自动加载它。这是我的代码的两个主要方法:
def train(
        self,
        data_frame: pd.DataFrame,
        travel_mode_column: str = 'target',
        shuffle: bool = True,
        batch_size: int = 100,
        max_epochs: int = 100,
        learning_rate: float = 0.01,
        beta1: float = 0.9,
        beta2: float = 0.999,
        epsilon: float = 1e-08,
        n_hidden: int = None,
        display_step: int = 20,
        train_ratio: float = 0.8,
        val_ratio: float = 0.2,
        test_ratio: float = 0.0,
        one_hot_encode: bool = False,
        fill_nan_with_mean: bool = True,
        convert_modes_to_numbers: bool = True,
        **kwargs
):
    """
    Pre-process the input and train a two-hidden-layer feed-forward network
    for travel mode detection, then persist the checkpoint and the
    class-to-number mapping under ``self.model_save_path``.

    :param data_frame: samples, one column holding the target labels
    :param travel_mode_column: name of the target (label) column
    :param shuffle: shuffle rows before partitioning
    :param batch_size: mini-batch size (capped at the number of samples)
    :param max_epochs: number of training epochs
    :param learning_rate: Adam learning rate
    :param beta1: Adam first-moment decay rate
    :param beta2: Adam second-moment decay rate
    :param epsilon: Adam numerical-stability constant
    :param n_hidden: hidden-layer width; defaults to the number of features
    :param display_step: print progress every this many epochs
    :param train_ratio: fraction of rows for the training partition
    :param val_ratio: fraction of rows for the validation partition
    :param test_ratio: fraction of rows for the test partition
    :param one_hot_encode: one-hot encode categorical features
    :param fill_nan_with_mean: replace NaNs with the column mean
    :param convert_modes_to_numbers: map class labels to integer ids
    :param kwargs: ignored; accepted for interface compatibility
    :return: None (model and class mapping are saved to disk)
    """
    # partition data frame into train, validation and test
    x_test, x_train, x_val, y_test, y_train, y_val = self.get_preprocessed_partitions(
        data_frame=data_frame,
        travel_mode_column=travel_mode_column,
        shuffle=shuffle,
        one_hot_encode=one_hot_encode,
        fill_nan_with_mean=fill_nan_with_mean,
        convert_modes_to_numbers=convert_modes_to_numbers,
        train_ratio=train_ratio,
        val_ratio=val_ratio,
        test_ratio=test_ratio
    )
    # network dimensions
    n_samples, n_features = x_train.shape
    if n_hidden is None:
        n_hidden = n_features
    batch_size = min(batch_size, n_samples)
    n_labels = len(data_frame[travel_mode_column].unique())
    # --- graph definition -------------------------------------------------
    # Placeholders are named so detect() can fetch them by name after the
    # graph is re-imported from the checkpoint meta file.
    input_x = tf.placeholder(tf.float32, [None, n_features], name='input_x')
    input_y = tf.placeholder(tf.int32, [None], name='input_y')
    # first hidden layer
    w1 = tf.Variable(tf.random_normal([n_features, n_hidden]))
    b1 = tf.Variable(tf.zeros([n_hidden]))
    h1 = tf.nn.relu(tf.nn.bias_add(tf.matmul(input_x, w1), b1))
    # second hidden layer
    w2 = tf.Variable(tf.random_normal([n_hidden, n_hidden]))
    b2 = tf.Variable(tf.zeros([n_hidden]))
    h2 = tf.nn.relu(tf.nn.bias_add(tf.matmul(h1, w2), b2))
    # output layer: one logit per class
    w3 = tf.Variable(tf.random_normal([n_hidden, n_labels]))
    b3 = tf.Variable(tf.zeros([n_labels]))
    logits = tf.nn.bias_add(tf.matmul(h2, w3), b3)
    # loss: sparse version takes integer class ids directly
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits,
        labels=input_y
    )
    mean_cross_entropy = tf.reduce_mean(cross_entropy, name='loss')
    # scalar summary for a scalar tensor (original used histogram)
    tf.summary.scalar('loss', mean_cross_entropy)
    # optimizer
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate,
        beta1=beta1,
        beta2=beta2,
        epsilon=epsilon
    )
    train_op = optimizer.minimize(mean_cross_entropy)
    # prediction: index of the largest logit; named for later lookup
    prediction = tf.cast(tf.argmax(logits, 1), tf.int32, name='prediction')
    tf.summary.histogram('prediction', prediction)
    # accuracy; cast to float so the mean is fractional
    accuracy = tf.cast(tf.equal(prediction, input_y), tf.float32)
    mean_accuracy = tf.reduce_mean(accuracy, name='accuracy')
    tf.summary.scalar('accuracy', mean_accuracy)
    # merge_all is graph construction: do it ONCE, outside the training
    # loop (the original re-merged every epoch, growing the graph)
    merged_summaries = tf.summary.merge_all()
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    n_batches = n_samples // batch_size  # invariant: hoisted out of the epoch loop
    with tf.Session() as session:
        session.run(init)
        train_writer = tf.summary.FileWriter(os.path.join(self.model_save_path, 'logs'), session.graph)
        # training cycle
        for epoch in range(max_epochs):
            epoch_loss = 0.0
            epoch_accuracy = 0.0
            for i in range(n_batches):
                # sample a random mini-batch without replacement
                batch_index = np.random.choice(n_samples, batch_size, replace=False)
                batch_x = x_train[batch_index]
                batch_y = y_train[batch_index].flatten()
                # run backprop and fetch loss/accuracy for monitoring
                summary, _, loss, epoch_accuracy = session.run(
                    fetches=[merged_summaries, train_op, mean_cross_entropy, mean_accuracy],
                    feed_dict={input_x: batch_x, input_y: batch_y}
                )
                epoch_loss += loss / n_batches
                # use a global step so later epochs do not overwrite earlier
                # summaries (the original passed the per-epoch index i)
                train_writer.add_summary(summary, epoch * n_batches + i)
            # display logs per epoch step
            if epoch % display_step == 0:
                print(
                    "Epoch:", '%03d' % epoch,
                    "loss={:.6f}".format(epoch_loss),
                    "accuracy={:.4f}".format(epoch_accuracy)
                )
        # final evaluation on the full training partition
        predicted, epoch_accuracy = session.run(
            fetches=[prediction, mean_accuracy],
            feed_dict={input_x: x_train, input_y: y_train.flatten()}
        )
        print("Final accuracy on train set:", epoch_accuracy)
        # BUG FIX: the original fed x_test/y_test here while printing
        # "validation set" -- and with the default test_ratio=0.0 the test
        # partition is empty.  Evaluate the validation partition instead.
        predicted, epoch_accuracy = session.run(
            fetches=[prediction, mean_accuracy],
            feed_dict={input_x: x_val, input_y: y_val.flatten()}
        )
        print("Final accuracy on validation set:", epoch_accuracy)
        # persist the weights and the class->number mapping so detect()
        # reproduces the exact same label encoding in another process
        saver.save(session, os.path.join(self.model_save_path, 'model'))
        joblib.dump(self.classes, os.path.join(self.model_save_path, self.CLASSES_FILENAME))
def detect(self, data_frame: pd.DataFrame, batch_size=200, verbose=0):
    """
    Detect the travel mode of every sample in ``data_frame`` using the
    model previously saved by :meth:`train`.

    :param data_frame: samples to classify; must have exactly the same
        feature columns (in the same order) the model was trained on
    :param batch_size: number of samples fed to the network per run
    :param verbose: when truthy, print the restored class mapping and the
        distribution of predicted class ids
    :return: data frame of predicted travel mode labels
    :raises FileNotFoundError: if no checkpoint exists in the save path
    """
    # locate the newest checkpoint and start from a clean graph
    checkpoint = tf.train.latest_checkpoint(self.model_save_path)
    if checkpoint is None:
        # fail loudly instead of crashing later on `None + ".meta"`
        raise FileNotFoundError(
            'no checkpoint found in {}'.format(self.model_save_path)
        )
    tf.reset_default_graph()
    with tf.Session() as session:
        # Rebuild the graph from the meta file, then restore the trained
        # weights.  Do NOT run tf.global_variables_initializer() here: the
        # original ran it on the freshly reset (empty) graph, where it was
        # a no-op -- but placed after the import it would overwrite the
        # restored weights with fresh random values.
        saver = tf.train.import_meta_graph(checkpoint + ".meta")
        saver.restore(session, checkpoint)
        graph = tf.get_default_graph()
        prediction = graph.get_tensor_by_name('prediction:0')
        input_x = graph.get_tensor_by_name('input_x:0')
        # restore the class<->number mapping saved at training time so the
        # numeric predictions decode identically in every process
        self.classes = joblib.load(os.path.join(self.model_save_path, self.CLASSES_FILENAME))
        self.classes2string = dict(enumerate(self.classes))
        self.classes2number = {c: i for i, c in enumerate(self.classes)}
        if verbose:  # the original printed unconditionally and ignored `verbose`
            print(str(self.classes))
            print(str(self.classes2string))
            print(str(self.classes2number))
        # sanity-check the input width against the restored placeholder
        assert data_frame.shape[1] == input_x.shape[1], \
            'dataframe must have same shape of trained model input!'
        # NOTE(review): this fills NaNs with the means of *this* data frame,
        # not the means observed during training; when detection runs in a
        # separate process on different data, the inputs are preprocessed
        # inconsistently with training, which can degrade accuracy.
        # Persist the training-set means and reuse them here -- TODO
        # confirm against get_preprocessed_partitions.
        data_frame = self.fill_nan_with_mean(data_frame)
        # run the network over fixed-size slices; the last slice may be
        # shorter.  (The original int(n/b)+1 loop fed one empty batch
        # whenever n_samples was an exact multiple of batch_size.)
        samples = data_frame.values
        n_samples = samples.shape[0]
        predictions = []
        for batch_begin in range(0, n_samples, batch_size):
            batch_x = samples[batch_begin:batch_begin + batch_size]
            predicted = session.run(prediction, feed_dict={input_x: batch_x})
            predictions.extend(predicted.tolist())
        if verbose:
            unique, counts = np.unique(predictions, return_counts=True)
            print(np.asarray((unique, counts)))
        # convert numeric predictions back to travel mode labels
        return self.convert_numbers_to_classes(pd.DataFrame(predictions))
我的问题是,当我在同一个python进程中执行这两个方法时,一切正常,但是当我尝试在另一个进程中仅调用detect
方法时,网络的输出将变得完全随机。
我已经检查过tensorflow变量是否被正确恢复,并且在两种情况下它们都是相同的,但是当我仅进行检测时,输出就完全混乱了。
以下是我在同一进程中进行训练和检测的结果:
{'Bus': 0, 'Car': 1, 'Still': 2, 'Train': 3, 'Walking': 4}
[[ 0 1 2 3]
[3262 1055 1242 1002]]
Full Data Accuracy 0.5941167504953513
以下是我仅使用已训练好的模型、在另一个进程中进行检测的结果:
{'Bus': 0, 'Car': 1, 'Still': 2, 'Train': 3, 'Walking': 4}
[[ 1 2 3]
[ 529 6030 2]]
Full Data Accuracy 0.2217649748513946
我已经在这个问题上浪费了很多时间。最初我以为自己遗漏了什么明显的东西,但是在尝试了一些保存和恢复的教程(包括官方 tensorflow 文档中的教程)之后,我仍然无法使其正常工作。
如果有人能指出我在这里想念的地方,我将感到非常高兴。
谢谢大家!
最好的问候,