This is my first time trying to implement a network architecture from a paper, and I am running into a resource exhausted (OOM) error. I should point out that I have trained a different network from TensorFlow (with the same kinds of layers) without any errors, and that training run took about 30 minutes.
The architecture I am using is the following (the function below contains the architecture and is called via tf.estimator; a short sketch of how it is wired up follows the function):
import numpy as np
import tensorflow as tf

def instrument_recognition_model(features, labels, mode):
    """
    Parameters:
        features: the array containing the examples used for training
        labels: the array containing the labels of the examples in one-hot representation
        mode: a tf.estimator.ModeKeys like tf.estimator.ModeKeys.TRAIN or tf.estimator.ModeKeys.PREDICT
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Input layer: add a single channel dimension
    input_layer = tf.reshape(features, [-1, features.shape[1], features.shape[2], 1])

    # Block 1: two 3x3 convolutions with 32 filters, then pooling and dropout
    conv1 = tf.layers.conv2d(
        inputs=input_layer,
        filters=32,
        kernel_size=[3, 3],
        strides=1,
        padding="same",
        activation=tf.nn.relu)
    print('Shape Conv 1', conv1.shape)
    conv2 = tf.layers.conv2d(
        inputs=conv1,
        filters=32,
        kernel_size=[3, 3],
        strides=1,
        padding="same",
        activation=tf.nn.relu)
    print('Shape Conv 2', conv2.shape)
    pool1 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[3, 3], strides=1)
    print('Shape Pool 1', pool1.shape)
    dropout1 = tf.layers.dropout(inputs=pool1, rate=0.25, training=mode == tf.estimator.ModeKeys.TRAIN)
    print('Shape Dropout 1', dropout1.shape)

    # Block 2: two 3x3 convolutions with 64 filters, then pooling and dropout
    conv3 = tf.layers.conv2d(
        inputs=dropout1,
        filters=64,
        kernel_size=[3, 3],
        strides=1,
        padding="same",
        activation=tf.nn.relu)
    print('Shape Conv 3', conv3.shape)
    conv4 = tf.layers.conv2d(
        inputs=conv3,
        filters=64,
        kernel_size=[3, 3],
        strides=1,
        padding="same",
        activation=tf.nn.relu)
    print('Shape Conv 4', conv4.shape)
    pool2 = tf.layers.max_pooling2d(inputs=conv4, pool_size=[3, 3], strides=1)
    print('Shape Pool 2', pool2.shape)
    dropout2 = tf.layers.dropout(inputs=pool2, rate=0.25, training=mode == tf.estimator.ModeKeys.TRAIN)
    print('Shape Dropout 2', dropout2.shape)

    # Block 3: two 3x3 convolutions with 128 filters, then pooling and dropout
    conv5 = tf.layers.conv2d(
        inputs=dropout2,
        filters=128,
        kernel_size=[3, 3],
        strides=1,
        padding="same",
        activation=tf.nn.relu)
    print('Shape Conv 5', conv5.shape)
    conv6 = tf.layers.conv2d(
        inputs=conv5,
        filters=128,
        kernel_size=[3, 3],
        strides=1,
        padding="same",
        activation=tf.nn.relu)
    print('Shape Conv 6', conv6.shape)
    pool3 = tf.layers.max_pooling2d(inputs=conv6, pool_size=[3, 3], strides=1)
    print('Shape Pool 3', pool3.shape)
    dropout3 = tf.layers.dropout(inputs=pool3, rate=0.25, training=mode == tf.estimator.ModeKeys.TRAIN)
    print('Shape Dropout 3', dropout3.shape)

    # Block 4: two 3x3 convolutions with 256 filters, then "global" max pooling
    conv7 = tf.layers.conv2d(
        inputs=dropout3,
        filters=256,
        kernel_size=[3, 3],
        strides=1,
        padding="same",
        activation=tf.nn.relu)
    print('Shape Conv 7', conv7.shape)
    conv8 = tf.layers.conv2d(
        inputs=conv7,
        filters=256,
        kernel_size=[3, 3],
        strides=1,
        padding="same",
        activation=tf.nn.relu)
    print('Shape Conv 8', conv8.shape)
    global_maxpool = tf.layers.max_pooling2d(inputs=conv8, pool_size=[3, 3], strides=1)
    print('Shape Global Max Pool', global_maxpool.shape)

    # Flatten and run through the fully connected head
    flat = tf.reshape(global_maxpool, [-1, np.prod(global_maxpool.shape[1:])])
    print('Shape Flattened', flat.shape)
    dense = tf.layers.dense(inputs=flat, units=1024, activation=tf.nn.relu)
    print('Shape Fully Connected 1', dense.shape)
    dropout = tf.layers.dropout(inputs=dense, rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN)
    print('Shape Dropout', dropout.shape)

    # Logits layer: one unit per class
    logits = tf.layers.dense(inputs=dropout, units=labels.shape[1])

    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())
        logging_hook = tf.train.LoggingTensorHook({"loss": loss}, every_n_iter=10)
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op, training_hooks=[logging_hook])

    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(
            labels=tf.argmax(input=labels, axis=1),
            predictions=predictions["classes"])
    }
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
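For context, the wiring into tf.estimator looks roughly like this (a minimal sketch: the train_data / train_labels names, batch size, step count, and model_dir are placeholders, not my exact script):

# Minimal sketch of the tf.estimator wiring (placeholder names and values)
estimator = tf.estimator.Estimator(
    model_fn=instrument_recognition_model,
    model_dir="/tmp/instrument_model")  # hypothetical checkpoint directory

train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=train_data,    # float32 array of shape [num_examples, height, width]
    y=train_labels,  # one-hot float32 array of shape [num_examples, num_classes]
    batch_size=32,   # placeholder batch size
    num_epochs=None,
    shuffle=True)

estimator.train(input_fn=train_input_fn, steps=1000)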
I have tried using the momentum optimizer, since it needs half the memory Adam does (Adam keeps two slot variables per parameter, momentum only one), and I have tried reducing the batch size; neither changed anything.
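The swap was just the one line in the TRAIN branch, along these lines (the momentum value of 0.9 is illustrative; I don't remember the exact hyperparameters I used):

# Replacing Adam (two extra slot variables per weight) with plain momentum (one)
optimizer = tf.train.MomentumOptimizer(learning_rate=0.001, momentum=0.9)
train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())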
I am working on a multiclass classification problem, and I tried reducing the number of classes from 10 to 3; nothing changed. I also tried removing different layers, but I still get the same error.
GPU: a K80 with 11 GB of memory on Google Cloud.
From what I have gathered online, it is an architecture problem, but I don't know what is wrong with it.
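To give a sense of the scale involved, here is a back-of-the-envelope estimate of the first fully connected layer under an assumed input size (my real input dimensions may differ):

# Rough estimate, assuming a hypothetical 128x128 single-channel input.
# With padding="same" and stride-1 convolutions, only the four stride-1,
# pool_size=3 max-pooling layers shrink the spatial dims (by 2 each).
height = width = 128          # assumed input size, not the actual dataset dimensions
for _ in range(4):            # four max_pooling2d layers with pool_size=3, strides=1
    height, width = height - 2, width - 2

flat_units = height * width * 256          # 256 channels after conv8
dense_params = flat_units * 1024           # weight matrix of the 1024-unit dense layer
print(flat_units)                          # 120 * 120 * 256 = 3,686,400
print(dense_params * 4 / 1024**3, "GiB")   # ~14 GiB of float32 weights alone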
I am attaching the paper that describes the dataset and the network to be implemented (on page 4), to give a general idea of the problem.
Thanks in advance.