TensorFlow loss suddenly explodes

Asked: 2017-06-21 06:50:47

Tags: machine-learning tensorflow

Hello, I am using TensorFlow for a machine learning task (regression).

I use a plain feed-forward neural network with ELU as the activation function.

But whenever I use ELU as the activation function, the loss suddenly jumps at some point during training.

With sigmoid activation this does not happen.

Setup

  1. Task: regression
  2. Learning rate = 0.05
  3. Optimizer: Adam
  4. Loss: MSE
  5. Network structure: plain NN with 4 hidden layers
  6. The full code and training log are below!

    • There is also something even stranger around epoch 64: the GPU device-creation log is suddenly printed again, and I am not sure why. This happens on every run.
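
For context, this is roughly how I construct and train the model defined below (the batch_size, neighbor, and directory values here are placeholders, not my real settings; batch_gen is my data generator):

    # Hypothetical driver showing how DeepModel and train_deep are wired up;
    # the numeric values and paths are placeholders.
    model = DeepModel(learning_rate=0.05, batch_size=128, neighbor=10)
    model.build_graph()
    train_deep(model, batch_gen, epoch_num=100, batch_size=128,
               batch_num=1000, checkpoint_dir='checkpoints/',
               graph_dir='graphs/', skip_step=10)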

Model code

    import tensorflow as tf

    class DeepModel:
    
        def __init__(self, learning_rate, batch_size, neighbor, weight_decay = 0.9, huber_delta=0.3, keep_prob_lst=[]):
            """ hyperparameters """
            self.isConv = False
            self.batch_size = batch_size
            self.lr = learning_rate
            self.input_size = neighbor * 3
            self.output_size = 1
            self.weight_decay = weight_decay
            self.layer1_size = 300
            self.layer2_size = 300
            self.layer3_size = 300
            self.layer4_size = 300
            self.huber_delta = huber_delta
            self.keep_prob_lst_val = keep_prob_lst
            self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')
    
        def _create_placeholders(self):
            """ define the placeholders for input and output """
            with tf.name_scope("data"):
                self.input = tf.placeholder(tf.float32, shape = [self.batch_size, self.input_size], name='input')
                self.output = tf.placeholder(tf.float32, shape= [self.batch_size, self.output_size], name='output')
            with tf.name_scope("keep_prob"):
                self.keep_prob_lst = [ tf.placeholder(tf.float32, name='layer{}_keep_prob'.format(idx+1)) for idx in range(len(self.keep_prob_lst_val))]
    
        def _create_weights(self):
            """ define weights. """
            # Small random-normal initialization for the weights, zeros for the biases.
            with tf.name_scope("weights"):
                self.W_1 = tf.Variable(tf.random_normal([self.input_size, self.layer1_size], stddev=0.01, mean=0.0, seed=0), name='layer1_weight')
                self.b_1 = tf.Variable(tf.zeros([1,self.layer1_size]), name='layer1_bias')
    
                self.W_2 = tf.Variable(tf.random_normal([self.layer1_size, self.layer2_size], stddev=0.01, mean=0.0, seed=0), name='layer2_weight')
                self.b_2 = tf.Variable(tf.zeros([1,self.layer2_size]), name='layer2_bias')
    
                self.W_3 = tf.Variable(tf.random_normal([self.layer2_size, self.layer3_size], stddev=0.01, mean=0.0, seed=0), name='layer3_weight')
                self.b_3 = tf.Variable(tf.zeros([1,self.layer3_size]), name='layer3_bias')
    
                self.W_4 = tf.Variable(tf.random_normal([self.layer3_size, self.layer4_size], stddev=0.01, mean=0.0, seed=0), name='layer4_weight')
                self.b_4 = tf.Variable(tf.zeros([1,self.layer4_size]), name='layer4_bias')
    
                self.W_out = tf.Variable(tf.random_normal([self.layer4_size, self.output_size], stddev=0.01, mean=0.0, seed=0), name='layer_out_weight')
                self.b_out = tf.Variable(tf.zeros([1,self.output_size]), name='layer_out_bias')
    
        def _create_loss(self):
            """ define the inference + the loss function """
            with tf.name_scope("loss"):
                self.layer1_output = tf.nn.elu(tf.matmul(self.input, self.W_1) + self.b_1)
    
                self.layer2_output = tf.nn.elu(tf.matmul(self.layer1_output, self.W_2) + self.b_2)
    
                self.layer3_output = tf.nn.elu(tf.matmul(self.layer2_output, self.W_3) + self.b_3)
    
                self.layer4_output = tf.nn.elu(tf.matmul(self.layer3_output, self.W_4) + self.b_4)
    
                self.layer_out_output = tf.matmul(self.layer4_output, self.W_out) + self.b_out
                self.se = 0.5 * tf.square(self.layer_out_output - self.output, name='square')
                self.loss = tf.reduce_mean(self.se)
    
        def _create_optimizer(self):
            """ define optimizer """
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(self.loss, 
                                                          global_step=self.global_step)
    
    
        def build_graph(self):
            """ Build the graph for our model """
            self._create_placeholders()
            self._create_weights()
            self._create_loss()
            self._create_optimizer()
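
One thing I noticed while cleaning this up: the constructor stores a huber_delta hyperparameter, but _create_loss never uses it; the loss is plain (half) squared error. If the spike comes from a few outlier targets, a Huber loss would bound their gradient contribution. A minimal sketch of what the self.se line could become (an experiment of mine, not the code that produced the log below; it only uses tf.abs, tf.square, and tf.where):

    # Hypothetical Huber variant of the loss (not the code that produced the
    # log below): quadratic near zero, linear beyond huber_delta, so a single
    # bad target cannot blow up the gradient the way squared error can.
    residual = tf.abs(self.layer_out_output - self.output)
    quadratic = 0.5 * tf.square(residual)
    linear = self.huber_delta * (residual - 0.5 * self.huber_delta)
    self.se = tf.where(residual < self.huber_delta, quadratic, linear)
    self.loss = tf.reduce_mean(self.se)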
    

Training code

    import os
    import time

    # NUM_THREADS and LOG_DEVICE_PLACEMENT are module-level constants defined
    # elsewhere in my script.
    def train_deep(model, batch_gen, epoch_num, batch_size, batch_num, checkpoint_dir, graph_dir, skip_step):
        print('## Training deep Model')
        start_time = time.time()
        saver = tf.train.Saver()
        batches_seen = 0
    
        config=tf.ConfigProto(intra_op_parallelism_threads=NUM_THREADS, log_device_placement=LOG_DEVICE_PLACEMENT)
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            ckpt = tf.train.get_checkpoint_state(os.path.dirname(checkpoint_dir))
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                # create folder to store model weight
                os.system('mkdir {}'.format(checkpoint_dir))
    
            # writer = tf.summary.FileWriter(graph_dir, sess.graph)
            batches_seen = model.global_step.eval()
            current_epoch = int(batches_seen / batch_num)
    
            feed_dict={}
            for prob_ph, prob_val in zip(model.keep_prob_lst, model.keep_prob_lst_val):
                feed_dict[prob_ph] = prob_val
    
            for epoch in xrange(current_epoch, epoch_num):
                total_loss = 0.0 
                epoch_time = time.time()
                for batch in xrange(batch_num):
                    X_batch, Y_batch= batch_gen.next()
                    feed_dict[model.input] = X_batch
                    feed_dict[model.output] = Y_batch        
                    loss_batch, _ = sess.run([model.loss, model.optimizer], 
                                                                        feed_dict=feed_dict)
                    # writer.add_summary(summary, global_step=model.global_step.eval())
                    total_loss += loss_batch
    
                # f.write('# Epoch {}  average loss and time : {:.7f}, {:.1f} seconds\n'.format(epoch, total_loss / batch_num, time.time() - epoch_time))
                print ('# Epoch {}  average loss and time : {:.7f}, {:.1f} seconds'.format(epoch, total_loss / batch_num, time.time() - epoch_time))
    
                if (epoch + 1) % skip_step == 0:
                    saver.save(sess, checkpoint_dir + 'epoch', global_step=epoch, write_meta_graph=False)
            # writer.close()
            print('## Optimization Finished!')
            print('## Total time: {:.1f} seconds'.format(time.time() - start_time))
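
Since a learning rate of 0.05 is fairly high for Adam, one experiment I am considering is clipping the global gradient norm so that a single bad batch cannot take a huge step. A sketch of a clipped _create_optimizer (an untested assumption of mine, not the code that produced the log below; the 1.0 threshold is a guess):

    # Hypothetical clipped variant of _create_optimizer (untested): compute
    # the gradients, clip their global norm, then apply them, so one outlier
    # batch cannot produce an arbitrarily large Adam update.
    def _create_optimizer(self):
        opt = tf.train.AdamOptimizer(learning_rate=self.lr)
        grads_and_vars = opt.compute_gradients(self.loss)
        grads, variables = zip(*grads_and_vars)
        clipped_grads, _ = tf.clip_by_global_norm(grads, clip_norm=1.0)
        self.optimizer = opt.apply_gradients(zip(clipped_grads, variables),
                                             global_step=self.global_step)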
    

Total loss during training

    # Epoch 0  average loss and time : 0.0247146, 196.3 seconds
    # Epoch 1  average loss and time : 0.0044747, 196.2 seconds
    # Epoch 2  average loss and time : 0.0047277, 186.7 seconds
    # Epoch 3  average loss and time : 0.0044042, 188.7 seconds
    # Epoch 4  average loss and time : 0.0044953, 188.2 seconds
    # Epoch 5  average loss and time : 0.0044432, 194.8 seconds
    # Epoch 6  average loss and time : 0.0045212, 194.8 seconds
    # Epoch 7  average loss and time : 0.0045521, 187.3 seconds
    # Epoch 8  average loss and time : 0.0047246, 174.0 seconds
    # Epoch 9  average loss and time : 0.0045385, 175.2 seconds
    # Epoch 10  average loss and time : 0.0045456, 181.9 seconds
    # Epoch 11  average loss and time : 0.0045506, 169.9 seconds
    # Epoch 12  average loss and time : 0.0045853, 172.5 seconds
    # Epoch 13  average loss and time : 0.0045306, 171.0 seconds
    # Epoch 14  average loss and time : 0.0044970, 171.2 seconds
    # Epoch 15  average loss and time : 0.0045254, 171.5 seconds
    # Epoch 16  average loss and time : 0.0044871, 169.4 seconds
    # Epoch 17  average loss and time : 0.0045064, 170.4 seconds
    # Epoch 18  average loss and time : 0.0045122, 194.6 seconds
    # Epoch 19  average loss and time : 0.0045325, 171.7 seconds
    # Epoch 20  average loss and time : 0.0044945, 194.5 seconds
    # Epoch 21  average loss and time : 0.0045799, 194.8 seconds
    # Epoch 22  average loss and time : 0.0044225, 171.6 seconds
    # Epoch 23  average loss and time : 0.0045016, 171.3 seconds
    # Epoch 24  average loss and time : 0.0044856, 172.7 seconds
    # Epoch 25  average loss and time : 0.0044534, 173.9 seconds
    # Epoch 26  average loss and time : 0.0044783, 193.3 seconds
    # Epoch 27  average loss and time : 0.0044508, 191.4 seconds
    # Epoch 28  average loss and time : 0.0044432, 170.9 seconds
    # Epoch 29  average loss and time : 0.0044638, 173.0 seconds
    # Epoch 30  average loss and time : 0.0044592, 172.8 seconds
    # Epoch 31  average loss and time : 0.0043936, 174.0 seconds
    # Epoch 32  average loss and time : 0.0044174, 172.7 seconds
    # Epoch 33  average loss and time : 0.0044253, 172.1 seconds
    # Epoch 34  average loss and time : 0.0043917, 171.7 seconds
    # Epoch 35  average loss and time : 0.0044062, 171.9 seconds
    # Epoch 36  average loss and time : 0.0044309, 171.7 seconds
    # Epoch 37  average loss and time : 0.0043879, 174.5 seconds
    # Epoch 38  average loss and time : 0.0043931, 186.0 seconds
    # Epoch 39  average loss and time : 95.0471165, 171.2 seconds
    # Epoch 40  average loss and time : 0.0129813, 171.2 seconds
    # Epoch 41  average loss and time : 0.0115643, 171.3 seconds
    # Epoch 42  average loss and time : 0.0106565, 172.0 seconds
    # Epoch 43  average loss and time : 0.0098280, 172.5 seconds
    # Epoch 44  average loss and time : 0.0090725, 171.9 seconds
    # Epoch 45  average loss and time : 0.0083836, 171.5 seconds
    # Epoch 46  average loss and time : 0.0077587, 170.9 seconds
    # Epoch 47  average loss and time : 0.0071937, 170.8 seconds
    # Epoch 48  average loss and time : 0.0066826, 171.4 seconds
    # Epoch 49  average loss and time : 0.0062041, 178.2 seconds
    # Epoch 50  average loss and time : 0.0057936, 182.3 seconds
    # Epoch 51  average loss and time : 0.0054678, 195.1 seconds
    # Epoch 52  average loss and time : 0.0053669, 171.2 seconds
    # Epoch 53  average loss and time : 0.0051792, 172.1 seconds
    # Epoch 54  average loss and time : 0.0051216, 194.4 seconds
    # Epoch 55  average loss and time : 0.0050883, 194.8 seconds
    # Epoch 56  average loss and time : 0.0049717, 171.6 seconds
    # Epoch 57  average loss and time : 0.0049208, 186.9 seconds
    # Epoch 58  average loss and time : 0.0050312, 170.8 seconds
    # Epoch 59  average loss and time : 0.0048330, 170.5 seconds
    # Epoch 60  average loss and time : 0.0049766, 170.1 seconds
    # Epoch 61  average loss and time : 0.0049494, 171.2 seconds
    # Epoch 62  average loss and time : 0.0049745, 171.1 seconds
    # Epoch 63  average loss and time : 0.0052638, 171.4 seconds
    # Epoch 64  average loss and tiI tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 980, pci bus id: 0000:89:00.0)
    I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 980, pci bus id: 0000:89:00.0)
    I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 980, pci bus id: 0000:89:00.0)
    me : 0.0044675, 171.4 seconds
    # Epoch 65  average loss and time : 0.0051712, 171.2 seconds
    # Epoch 66  average loss and time : 0.0045092, 171.4 seconds
    # Epoch 67  average loss and time : 0.0049986, 171.4 seconds
    # Epoch 68  average loss and time : 0.0050070, 171.2 seconds
    # Epoch 69  average loss and time : 0.0047540, 170.0 seconds
    # Epoch 70  average loss and time : 0.0051099, 194.4 seconds
    # Epoch 71  average loss and time : 0.0044266, 171.3 seconds
    # Epoch 72  average loss and time : 0.0052696, 193.5 seconds
    # Epoch 73  average loss and time : 0.0044099, 171.9 seconds
    # Epoch 74  average loss and time : 0.0054506, 194.5 seconds
    # Epoch 75  average loss and time : 0.0044097, 194.8 seconds
    # Epoch 76  average loss and time : 0.0048847, 193.7 seconds
    # Epoch 77  average loss and time : 0.0044050, 176.3 seconds
    # Epoch 78  average loss and time : 0.0050718, 194.3 seconds
    # Epoch 79  average loss and time : 0.0043967, 171.6 seconds
    # Epoch 80  average loss and time : 0.0045824, 188.8 seconds
    # Epoch 81  average loss and time : 0.0049379, 172.6 seconds
    # Epoch 82  average loss and time : 0.0047899, 194.5 seconds
    # Epoch 83  average loss and time : 0.0045796, 193.2 seconds
    # Epoch 84  average loss and time : 0.0046792, 195.2 seconds
    # Epoch 85  average loss and time : 0.0049916, 195.4 seconds
    # Epoch 86  average loss and time : 0.0044301, 194.1 seconds
    # Epoch 87  average loss and time : 0.0047661, 170.6 seconds
    # Epoch 88  average loss and time : 0.0047771, 173.3 seconds
    # Epoch 89  average loss and time : 0.0044409, 172.3 seconds
    # Epoch 90  average loss and time : 0.0045383, 176.0 seconds
    # Epoch 91  average loss and time : 0.0046460, 171.4 seconds
    # Epoch 92  average loss and time : 0.0046248, 194.0 seconds
    # Epoch 93  average loss and time : 0.0045983, 195.3 seconds
    # Epoch 94  average loss and time : 0.0046806, 172.9 seconds
    # Epoch 95  average loss and time : 0.0045397, 187.3 seconds
    # Epoch 96  average loss and time : 0.0044473, 171.4 seconds
    # Epoch 97  average loss and time : 0.0047241, 171.9 seconds
    # Epoch 98  average loss and time : 0.0045720, 171.3 seconds
    # Epoch 99  average loss and time : 0.0045641, 171.3 seconds
    

Does anyone know why this happens?

How can I debug this? Should I manually inspect the weights? One idea is sketched below.
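
The only concrete idea I have so far is to watch the global weight and gradient norms during training and print them when the loss jumps. A minimal sketch against the model above (the norm tensors and the 1.0 spike threshold are additions of mine, not part of the graph that produced the log):

    # Hypothetical debugging additions, built once after model.build_graph():
    weight_norm = tf.global_norm(tf.trainable_variables())
    grad_norm = tf.global_norm(tf.gradients(model.loss, tf.trainable_variables()))

    # Then, inside the batch loop of train_deep, fetch the norms with the loss:
    loss_batch, w_n, g_n, _ = sess.run(
        [model.loss, weight_norm, grad_norm, model.optimizer],
        feed_dict=feed_dict)
    if loss_batch > 1.0:  # assumed spike threshold for this loss scale
        print('# spike: loss {:.4f}, |W| {:.2f}, |grad| {:.2f}'.format(
            loss_batch, w_n, g_n))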

0 Answers:

No answers