After calling train_model several times on the class below, I get the following error:
terminate called after throwing an instance of 'std::system_error'
what(): Resource temporarily unavailable
I would like to be able to call train_model multiple times after instantiating the class, with a fixed num_steps and a different model_dir each time.
(I thought there was a memory leak somewhere, but I couldn't track it down: GPU memory usage does not change after each call to train_model, while RAM usage grows slightly with every call, yet when the error occurs the machine still has plenty of free RAM and GPU memory.) -> Update: it is not a memory leak, it is too many open files.
I ran the same thing on another machine, where the error is more explicit: ResourceExhausted: too many open files.
Looking at lsof, I can see that TF opens a new events.out.tfevents file after each call to train_model with a different checkpoint directory. Any idea how to close the events.out.tfevents files once the tf.estimator call has finished?
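For completeness, this is roughly how the open-descriptor count can be tracked from inside the process while training (a minimal Linux-only sketch using just the standard library and /proc; it is diagnostic code of mine, not part of the model):

import os
import resource

def open_fd_count():
    # Count the file descriptors currently held by this process.
    # Reads /proc/self/fd, i.e. the same information lsof -p <pid> shows.
    return len(os.listdir('/proc/self/fd'))

soft_limit, _hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
print('open fds: {} / soft limit: {}'.format(open_fd_count(), soft_limit))

Calling open_fd_count() before and after each train_model call makes the accumulation of events.out.tfevents descriptors visible well before the soft limit is reached.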
Here is my code (a simple feed-forward NN classifier with dropout and batch normalization):
import os

import numpy as np
import tensorflow as tf


class Model:
    def __init__(self):
        self.data_loaded = False
        self.train_data = None
        self.valid_data = None
        print('class created!')

    def input_fn(self, mode, batch_size, num_epochs=None):
        if mode == 'train':
            features_cont = self.data_dict['x_train_cont']
            features_cat = self.data_dict['x_train_cat']
            labels = self.data_dict['y_train']
        elif mode == 'valid':
            features_cont = self.data_dict['x_valid_cont']
            features_cat = self.data_dict['x_valid_cat']
            labels = self.data_dict['y_valid']
            num_epochs = 1
        elif mode == 'test':
            features_cont = self.data_dict['x_test_cont']
            features_cat = self.data_dict['x_test_cat']
            labels = np.zeros([features_cont.shape[0], 1])
            num_epochs = 1
        features = np.concatenate([features_cont, features_cat], axis=1)
        shuffle = mode == 'train'
        return tf.estimator.inputs.numpy_input_fn(features, labels,
                                                  batch_size=batch_size,
                                                  num_epochs=num_epochs,
                                                  shuffle=shuffle)

    def model_fn(self, features, labels, mode, params):
        is_training = mode == tf.estimator.ModeKeys.TRAIN
        # concat cont and cat features
        l2_weights = params['l2_w']
        dropout_rates = params['dropout']
        n_classes = 2  # binary classification
        x_in = features  # tf.concat(features, axis=1)
        hidden_layer = x_in
        for i, num_units in enumerate(params['num_units']):
            hidden_layer = tf.layers.dense(inputs=hidden_layer,
                                           units=num_units,
                                           name='hidden_{}'.format(i),
                                           kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=l2_weights[i]))
            hidden_layer = tf.layers.batch_normalization(hidden_layer, training=is_training)
            hidden_layer = tf.nn.relu(hidden_layer)
            hidden_layer = tf.layers.dropout(inputs=hidden_layer,
                                             rate=dropout_rates[i],
                                             name='hidden_drop_{}'.format(i),
                                             training=is_training)
        logits = tf.layers.dense(inputs=hidden_layer,
                                 units=n_classes,
                                 name='output')
        predictions = tf.nn.softmax(logits, name='probability_predictions')
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(mode,
                                              predictions={'predictions': predictions},
                                              # export_outputs=export_outputs
                                              )
        weights = tf.gather(tf.constant(self.class_weights), tf.cast(labels[:, 1], tf.int32))
        l2_loss = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
        loss = tf.losses.softmax_cross_entropy(labels, logits, weights) + l2_loss
        auc = tf.metrics.auc(labels[:, 1], predictions[:, 1])
        eval_metric_ops = {'auc': auc}
        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(mode, loss=loss,
                                              eval_metric_ops=eval_metric_ops)
        assert mode == tf.estimator.ModeKeys.TRAIN
        # needed for batch norm layer
        extra_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        global_step = tf.train.get_global_step()
        optimizer = tf.train.AdamOptimizer(learning_rate=params['learning_rate'], epsilon=1e-07)
        with tf.control_dependencies(extra_ops):
            train_op = optimizer.minimize(loss, global_step=global_step)
        # Set logging hook for tf.estimator
        logging_hook = tf.train.LoggingTensorHook({'step': global_step,
                                                   'loss': loss,
                                                   # 'auc': auc[1]
                                                   },
                                                  every_n_iter=1)
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          training_hooks=[logging_hook])

    def train_model(self, hps, model_dir=None, num_steps=None):
        max_steps = None
        num_epochs = None
        # get TF logger
        tf.logging.set_verbosity(tf.logging.INFO)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        self.setup_tf_logger(model_dir)
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True
        run_config = tf.estimator.RunConfig(
            save_checkpoints_steps=1e10,
            keep_checkpoint_max=1,
            model_dir=model_dir,
            session_config=config
        )
        batch_size = hps['batch_size']
        if self.train_data is None:
            self.train_data = self.input_fn(mode='train',
                                            batch_size=batch_size,
                                            num_epochs=num_epochs)
            self.valid_data = self.input_fn(mode='valid',
                                            batch_size=100000,
                                            num_epochs=1)
        model = tf.estimator.Estimator(model_fn=self.model_fn,
                                       params=hps,
                                       config=run_config)
        model.train(input_fn=self.train_data,
                    steps=num_steps,
                    max_steps=None)
        eval_out = model.evaluate(input_fn=self.valid_data)
        return eval_out['auc']
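The calling pattern that triggers the error looks roughly like the sketch below. The hyperparameter values are placeholders, and data_dict, class_weights and setup_tf_logger are assumed to be set up elsewhere (they are not shown above):

# Hypothetical driver loop: fixed num_steps, a fresh model_dir per call.
hps = {'num_units': [64, 64],
       'l2_w': [1e-4, 1e-4],
       'dropout': [0.3, 0.3],
       'learning_rate': 1e-3,
       'batch_size': 256}

model = Model()
# model.data_dict and model.class_weights must be populated here.
for run_id in range(500):
    auc = model.train_model(hps,
                            model_dir='./runs/run_{}'.format(run_id),
                            num_steps=1000)
    print(run_id, auc)

Each iteration creates a new Estimator and a new events.out.tfevents file in its model_dir, which is what eventually exhausts the file-descriptor limit.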
I had to modify the TF code itself to work around this. Currently, in basic_session_run_hooks.py and estimator.py, flush() is called on the summary writer, which only dumps the pending data but does not close the file. I changed those calls on the summary writer from flush() to close(). The files now appear to be closed after the tf.estimator call, and I no longer hit the ResourceExhausted error. There is surely a reason (probably the cost of repeatedly opening and closing files) why the TensorFlow team uses flush() rather than close() on the summary writer, but it can lead to issues like the one I reported here.