我的tensorflow代码在运行过程中,ROM内存不断增加,最终导致内存崩溃

时间:2018-03-01 02:34:41

标签: python tensorflow

这是我的代码,我尝试了几种方法无法解决,想知道这样做的好方法是什么,最好的方法是什么?

我一直怀疑我在某些地方添加了新节点,但我找不到原因。这个问题我以前从未遇到过,我也尝试手动清除内存,但没有成功。有一个问题,运行这段代码时,我的GPU工作效率只有2%,可以提高GPU的效率吗?

import tensorflow as tf
import numpy as np
import sklearn.metrics
import copy
import time
import gc


class DataDeal():
    def __init__(self, batch_size):
        self._batch_size = batch_size

    def readTfrecord(self, file, epoch=None, isTrain=True):
        fileQueue = tf.train.string_input_producer(string_tensor=[file], num_epochs=epoch, shuffle=True)
        reader = tf.TFRecordReader()
        _, example_series = reader.read(queue=fileQueue)
        features = tf.parse_single_example(serialized=example_series,
                                           features={"label": tf.FixedLenFeature([], tf.string),
                                                     "data_raw": tf.FixedLenFeature([], tf.string)})
        data = tf.decode_raw(features["data_raw"], out_type=tf.float32)
        label = tf.decode_raw(features["label"], out_type=tf.int32)
        data = tf.reshape(data, shape=(1, 600))
        label = tf.reshape(label, shape=(1, 2))
        if isTrain:
            data, label = tf.train.shuffle_batch([data, label], batch_size=self._batch_size, capacity=2000,
                                                 min_after_dequeue=500, num_threads=3)
        else:
            assert epoch == None, "wrong!"
            data, label = tf.train.batch([data, label], batch_size=500)
        return (data, label)


class DNN(DataDeal):
    def __init__(self, layer_shape, epoch, eta, batch_size, norm=True, L2_loss=True):
        super(DNN, self).__init__(batch_size)
        self._layer_shape = [600] + layer_shape + [2]
        self.batch_size = batch_size
        self._norm = norm
        self.L2_loss = L2_loss
        self.eta = eta
        self.epoch = epoch
        with tf.name_scope("input"):
            self.x = tf.placeholder(dtype=tf.float32, shape=(None, 600), name="input_X")
            self.y = tf.placeholder(dtype=tf.int32, shape=(None, 2), name="label")
            self.keep_pro = tf.placeholder(dtype=tf.float32, name="keep_pro")
            self._y = tf.cast(self.y, dtype=tf.float32)
        with tf.variable_scope("dnn"):
            self._W = [tf.get_variable(name="layerW_%d" % index, shape=(x[0], x[1]), dtype=tf.float32,
                                       initializer=tf.truncated_normal_initializer())
                       for index, x in enumerate(list(zip(self._layer_shape[:-1], self._layer_shape[1:])))]
            self._B = [tf.get_variable(name="layerB_%d" % index, shape=(1, x1), dtype=tf.float32,
                                       initializer=tf.truncated_normal_initializer())
                       for index, x1 in enumerate(self._layer_shape[1:])]
            self.global_step = tf.get_variable(name="global_step", dtype=tf.int32, initializer=0,
                                               trainable=False)  # GLOBAL STEP

    def batch_normalization(self, input_):
        mean, var = tf.nn.moments(input_, [0, 1], keep_dims=True)
        shift = tf.get_variable(shape=[1, input_.get_shape().as_list()[-1]], dtype=tf.float32,
                                initializer=tf.zeros_initializer(), name="shift_1")
        scale = tf.get_variable(shape=[1, input_.get_shape().as_list()[-1]], dtype=tf.float32,
                                initializer=tf.constant_initializer(1.0), name="scale_1")
        # shift= tf.Variable(initial_value=tf.truncated_normal(shape=[1, input_.get_shape().as_list()[-1]],dtype=tf.float32))
        epsilon = 1e-3
        output = tf.nn.batch_normalization(input_, mean, var, shift, scale, epsilon)
        return output

    def run(self):
        first_output = tf.add(tf.matmul(self.x, self._W[0]), self._B[0])
        if self._norm:
            first_output = tf.nn.relu(self.batch_normalization(input_=first_output))  # relu+BN
        else:
            first_output = tf.nn.sigmoid(first_output)
        for i in range(1, len(self._W) - 1):
            if self._norm:
                first_output = tf.add(tf.matmul(first_output, self._W[i]), self._B[i])
                with tf.variable_scope("layer%d" % i):
                    first_output = tf.nn.relu(self.batch_normalization(input_=first_output))
            else:
                first_output = tf.sigmoid(tf.add(tf.matmul(first_output, self._W[i]), self._B[i]))
        first_output = tf.nn.dropout(first_output, keep_prob=self.keep_pro)
        last_output = tf.add(tf.matmul(first_output, self._W[-1]), self._B[-1])
        return last_output


if __name__ == "__main__":

    file_train = "D:/traindata/XIEBO/train.tfrecords"
    file_test = "D:/traindata/XIEBO/test.tfrecords"
    dnn_object = DNN(layer_shape=[512, 128],
                     epoch=5000,
                     eta=0.001,
                     batch_size=128,
                     norm=False,
                     L2_loss=True)
    data, label = dnn_object.readTfrecord(file=file_train, epoch=dnn_object.epoch,
                                          isTrain=True)
    data_test, label_test = dnn_object.readTfrecord(file=file_test, epoch=None, isTrain=False)
    output = tf.nn.softmax(dnn_object.run())
    tvars = copy.copy(tf.trainable_variables())
    loss = tf.nn.softmax_cross_entropy_with_logits(labels=dnn_object.y, logits=output)
    if dnn_object.L2_loss:
        loss_l2 = 0.0005 * tf.reduce_sum([tf.nn.l2_loss(x) for x in tvars])
        loss += loss_l2
    train_first_op = tf.train.GradientDescentOptimizer(learning_rate=dnn_object.eta).minimize(loss=loss,
                                                                                              global_step=dnn_object.global_step)
    variable_averages = tf.train.ExponentialMovingAverage(decay=0.999, num_updates=dnn_object.global_step)
    variable_averages_op = variable_averages.apply(tf.trainable_variables())
    with tf.control_dependencies([train_first_op]):
        train_step = tf.group(variable_averages_op)
    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        # sess.graph.finalize()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        data_test, label_test = sess.run([data_test, label_test])
        data_test = np.reshape(data_test, newshape=(-1, 600))
        label_test = np.reshape(label_test, newshape=(-1, 2))
        step = 0
        try:
            while not coord.should_stop():
                data_batch, label_batch = sess.run([data, label])
                data_batch = np.reshape(data_batch, newshape=(-1, 600))
                label_batch = np.reshape(label_batch, newshape=(-1, 2))
                _, pred_label, loss_look = sess.run([train_step, output, tf.reduce_mean(loss)], feed_dict={
                    dnn_object.x: data_batch, dnn_object.y: label_batch, dnn_object.keep_pro: 0.8})
                if (step != 0) & (step % 100 == 0):
                    print("now is step %d, trainning acc is %s, trainning loss is %s" % (
                        step, np.mean(np.equal(np.argmax(pred_label, axis=1), np.argmax(label_batch, axis=1))),
                        loss_look
                    ))
                    del data_batch, label_batch, pred_label, loss_look
                    gc.collect()
                    pred_label, loss_look = sess.run([output, tf.reduce_mean(loss)], feed_dict={
                        dnn_object.x: data_test, dnn_object.y: label_test, dnn_object.keep_pro: 1.0})
                    print("now is step %d, testing acc is %s, testing loss is %s" % (
                        step, np.mean(np.equal(np.argmax(pred_label, axis=1), np.argmax(label_test, axis=1))),
                        loss_look
                    ))
                    del pred_label, loss_look
                    gc.collect()
                    print(step, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))
                else:
                    del data_batch, label_batch, pred_label, loss_look
                    gc.collect()
                step += 1
        except tf.errors.OutOfRangeError as e:
            print(e, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))
        finally:
            coord.request_stop()
            coord.join(threads=threads)
            print("over", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))

1 个答案:

答案 0 :(得分:0)

我只是解决了这个问题。原因是我使用这段代码:

 _, pred_label, loss_look = sess.run([train_step, output, tf.reduce_mean(loss)], feed_dict={dnn_object.x: data_batch, dnn_object.y: label_batch, dnn_object.keep_pro: 0.8})\

并且当我训练我的模型时,tf.reduce_mean(loss)已经构建了一个新的Variable或Opt每个循环。 现在我想知道如何提高GPU效率?或者它可能是不能人为改进的。