您好我使用Tensorflow进行机器学习任务(回归)
我使用普通神经网络并使用ELU作为激活功能。
但是每当我使用ELU作为激活功能时,突然损失会在某个时刻增加。
使用sigmoid激活,这不会发生。
设置
整个代码并在下面记录!
型号代码
class DeepModel:
def __init__(self, learning_rate, batch_size, neighbor, weight_decay = 0.9, huber_delta=0.3, keep_prob_lst=[]):
""" hyperparameters """
self.isConv = False
self.batch_size = batch_size
self.lr = learning_rate
self.input_size = neighbor * 3
self.output_size = 1
self.weight_decay = weight_decay
self.layer1_size = 300
self.layer2_size = 300
self.layer3_size = 300
self.layer4_size = 300
self.huber_delta = huber_delta
self.keep_prob_lst_val = keep_prob_lst
self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')
def _create_placeholders(self):
""" define the placeholders for input and output """
with tf.name_scope("data"):
self.input = tf.placeholder(tf.float32, shape = [self.batch_size, self.input_size], name='input')
self.output = tf.placeholder(tf.float32, shape= [self.batch_size, self.output_size], name='output')
with tf.name_scope("keep_prob"):
self.keep_prob_lst = [ tf.placeholder(tf.float32, name='layer{}_keep_prob'.format(idx+1)) for idx in range(len(self.keep_prob_lst_val))]
def _create_weights(self):
""" define weights. """
# Assemble this part of the graph on the CPU. You can change it to GPU if you have GPU
with tf.name_scope("weights"):
self.W_1 = tf.Variable(tf.random_normal([self.input_size, self.layer1_size], stddev=0.01, mean=0.0, seed=0), name='layer1_weight')
self.b_1 = tf.Variable(tf.zeros([1,self.layer1_size]), name='layer1_bias')
self.W_2 = tf.Variable(tf.random_normal([self.layer1_size, self.layer2_size], stddev=0.01, mean=0.0, seed=0), name='layer2_weight')
self.b_2 = tf.Variable(tf.zeros([1,self.layer2_size]), name='layer2_bias')
self.W_3 = tf.Variable(tf.random_normal([self.layer2_size, self.layer3_size], stddev=0.01, mean=0.0, seed=0), name='layer2_weight')
self.b_3 = tf.Variable(tf.zeros([1,self.layer3_size]), name='layer2_bias')
self.W_4 = tf.Variable(tf.random_normal([self.layer3_size, self.layer4_size], stddev=0.01, mean=0.0, seed=0), name='layer2_weight')
self.b_4 = tf.Variable(tf.zeros([1,self.layer4_size]), name='layer2_bias')
self.W_out = tf.Variable(tf.random_normal([self.layer4_size, self.output_size], stddev=0.01, mean=0.0, seed=0), name='layer_out_weight')
self.b_out = tf.Variable(tf.zeros([1,self.output_size]), name='layer_out_bias')
def _create_loss(self):
""" define the inference + the loss function """
with tf.name_scope("loss"):
self.layer1_output = tf.nn.elu(tf.matmul(self.input, self.W_1) + self.b_1)
self.layer2_output = tf.nn.elu(tf.matmul(self.layer1_output, self.W_2) + self.b_2)
self.layer3_output = tf.nn.elu(tf.matmul(self.layer2_output, self.W_3) + self.b_3)
self.layer4_output = tf.nn.elu(tf.matmul(self.layer3_output, self.W_4) + self.b_4)
self.layer_out_output = tf.matmul(self.layer4_output, self.W_out) + self.b_out
self.se = 0.5 * tf.square(self.layer_out_output - self.output, name='square')
self.loss = tf.reduce_mean(self.se)
def _create_optimizer(self):
""" define optimizer """
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(self.loss,
global_step=self.global_step)
def build_graph(self):
""" Build the graph for our model """
self._create_placeholders()
self._create_weights()
self._create_loss()
self._create_optimizer()
培训代码
def train_deep(model, batch_gen, epoch_num, batch_size, batch_num, checkpoint_dir, graph_dir, skip_step):
print('## Training deep Model')
start_time = time.time()
saver = tf.train.Saver()
batches_seen = 0
config=tf.ConfigProto(intra_op_parallelism_threads=NUM_THREADS, log_device_placement=LOG_DEVICE_PLACEMENT)
with tf.Session(config=config) as sess:
sess.run(tf.global_variables_initializer())
ckpt = tf.train.get_checkpoint_state(os.path.dirname(checkpoint_dir))
if ckpt and ckpt.model_checkpoint_path:
saver.restore(sess, ckpt.model_checkpoint_path)
else:
# create folder to store model weight
os.system('mkdir {}'.format(checkpoint_dir))
# writer = tf.summary.FileWriter(graph_dir, sess.graph)
batches_seen = model.global_step.eval()
current_epoch = int(batches_seen / batch_num)
feed_dict={}
for prob in zip(model.keep_prob_lst, model.keep_prob_lst_val):
feed_dict[prob[0]] = prob[1]
for epoch in xrange(current_epoch, epoch_num):
total_loss = 0.0
epoch_time = time.time()
for batch in xrange(batch_num):
X_batch, Y_batch= batch_gen.next()
feed_dict[model.input] = X_batch
feed_dict[model.output] = Y_batch
loss_batch, _ = sess.run([model.loss, model.optimizer],
feed_dict=feed_dict)
# writer.add_summary(summary, global_step=model.global_step.eval())
total_loss += loss_batch
# f.write('# Epoch {} average loss and time : {:.7f}, {:.1f} seconds\n'.format(epoch, total_loss / batch_num, time.time() - epoch_time))
print ('# Epoch {} average loss and time : {:.7f}, {:.1f} seconds'.format(epoch, total_loss / batch_num, time.time() - epoch_time))
if (epoch + 1) % skip_step == 0:
saver.save(sess, checkpoint_dir + 'epoch', epoch, write_meta_graph = False)
# writer.close()
print('## Optimization Finished!')
print('## Total time: {:.1f} seconds'.format(time.time() - start_time))
培训时间的总损失
# Epoch 0 average loss and time : 0.0247146, 196.3 seconds
# Epoch 1 average loss and time : 0.0044747, 196.2 seconds
# Epoch 2 average loss and time : 0.0047277, 186.7 seconds
# Epoch 3 average loss and time : 0.0044042, 188.7 seconds
# Epoch 4 average loss and time : 0.0044953, 188.2 seconds
# Epoch 5 average loss and time : 0.0044432, 194.8 seconds
# Epoch 6 average loss and time : 0.0045212, 194.8 seconds
# Epoch 7 average loss and time : 0.0045521, 187.3 seconds
# Epoch 8 average loss and time : 0.0047246, 174.0 seconds
# Epoch 9 average loss and time : 0.0045385, 175.2 seconds
# Epoch 10 average loss and time : 0.0045456, 181.9 seconds
# Epoch 11 average loss and time : 0.0045506, 169.9 seconds
# Epoch 12 average loss and time : 0.0045853, 172.5 seconds
# Epoch 13 average loss and time : 0.0045306, 171.0 seconds
# Epoch 14 average loss and time : 0.0044970, 171.2 seconds
# Epoch 15 average loss and time : 0.0045254, 171.5 seconds
# Epoch 16 average loss and time : 0.0044871, 169.4 seconds
# Epoch 17 average loss and time : 0.0045064, 170.4 seconds
# Epoch 18 average loss and time : 0.0045122, 194.6 seconds
# Epoch 19 average loss and time : 0.0045325, 171.7 seconds
# Epoch 20 average loss and time : 0.0044945, 194.5 seconds
# Epoch 21 average loss and time : 0.0045799, 194.8 seconds
# Epoch 22 average loss and time : 0.0044225, 171.6 seconds
# Epoch 23 average loss and time : 0.0045016, 171.3 seconds
# Epoch 24 average loss and time : 0.0044856, 172.7 seconds
# Epoch 25 average loss and time : 0.0044534, 173.9 seconds
# Epoch 26 average loss and time : 0.0044783, 193.3 seconds
# Epoch 27 average loss and time : 0.0044508, 191.4 seconds
# Epoch 28 average loss and time : 0.0044432, 170.9 seconds
# Epoch 29 average loss and time : 0.0044638, 173.0 seconds
# Epoch 30 average loss and time : 0.0044592, 172.8 seconds
# Epoch 31 average loss and time : 0.0043936, 174.0 seconds
# Epoch 32 average loss and time : 0.0044174, 172.7 seconds
# Epoch 33 average loss and time : 0.0044253, 172.1 seconds
# Epoch 34 average loss and time : 0.0043917, 171.7 seconds
# Epoch 35 average loss and time : 0.0044062, 171.9 seconds
# Epoch 36 average loss and time : 0.0044309, 171.7 seconds
# Epoch 37 average loss and time : 0.0043879, 174.5 seconds
# Epoch 38 average loss and time : 0.0043931, 186.0 seconds
# Epoch 39 average loss and time : 95.0471165, 171.2 seconds
# Epoch 40 average loss and time : 0.0129813, 171.2 seconds
# Epoch 41 average loss and time : 0.0115643, 171.3 seconds
# Epoch 42 average loss and time : 0.0106565, 172.0 seconds
# Epoch 43 average loss and time : 0.0098280, 172.5 seconds
# Epoch 44 average loss and time : 0.0090725, 171.9 seconds
# Epoch 45 average loss and time : 0.0083836, 171.5 seconds
# Epoch 46 average loss and time : 0.0077587, 170.9 seconds
# Epoch 47 average loss and time : 0.0071937, 170.8 seconds
# Epoch 48 average loss and time : 0.0066826, 171.4 seconds
# Epoch 49 average loss and time : 0.0062041, 178.2 seconds
# Epoch 50 average loss and time : 0.0057936, 182.3 seconds
# Epoch 51 average loss and time : 0.0054678, 195.1 seconds
# Epoch 52 average loss and time : 0.0053669, 171.2 seconds
# Epoch 53 average loss and time : 0.0051792, 172.1 seconds
# Epoch 54 average loss and time : 0.0051216, 194.4 seconds
# Epoch 55 average loss and time : 0.0050883, 194.8 seconds
# Epoch 56 average loss and time : 0.0049717, 171.6 seconds
# Epoch 57 average loss and time : 0.0049208, 186.9 seconds
# Epoch 58 average loss and time : 0.0050312, 170.8 seconds
# Epoch 59 average loss and time : 0.0048330, 170.5 seconds
# Epoch 60 average loss and time : 0.0049766, 170.1 seconds
# Epoch 61 average loss and time : 0.0049494, 171.2 seconds
# Epoch 62 average loss and time : 0.0049745, 171.1 seconds
# Epoch 63 average loss and time : 0.0052638, 171.4 seconds
# Epoch 64 average loss and tiI tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 980, pci bus id: 0000:89:00.0)
I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 980, pci bus id: 0000:89:00.0)
I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 980, pci bus id: 0000:89:00.0)
me : 0.0044675, 171.4 seconds
# Epoch 65 average loss and time : 0.0051712, 171.2 seconds
# Epoch 66 average loss and time : 0.0045092, 171.4 seconds
# Epoch 67 average loss and time : 0.0049986, 171.4 seconds
# Epoch 68 average loss and time : 0.0050070, 171.2 seconds
# Epoch 69 average loss and time : 0.0047540, 170.0 seconds
# Epoch 70 average loss and time : 0.0051099, 194.4 seconds
# Epoch 71 average loss and time : 0.0044266, 171.3 seconds
# Epoch 72 average loss and time : 0.0052696, 193.5 seconds
# Epoch 73 average loss and time : 0.0044099, 171.9 seconds
# Epoch 74 average loss and time : 0.0054506, 194.5 seconds
# Epoch 75 average loss and time : 0.0044097, 194.8 seconds
# Epoch 76 average loss and time : 0.0048847, 193.7 seconds
# Epoch 77 average loss and time : 0.0044050, 176.3 seconds
# Epoch 78 average loss and time : 0.0050718, 194.3 seconds
# Epoch 79 average loss and time : 0.0043967, 171.6 seconds
# Epoch 80 average loss and time : 0.0045824, 188.8 seconds
# Epoch 81 average loss and time : 0.0049379, 172.6 seconds
# Epoch 82 average loss and time : 0.0047899, 194.5 seconds
# Epoch 83 average loss and time : 0.0045796, 193.2 seconds
# Epoch 84 average loss and time : 0.0046792, 195.2 seconds
# Epoch 85 average loss and time : 0.0049916, 195.4 seconds
# Epoch 86 average loss and time : 0.0044301, 194.1 seconds
# Epoch 87 average loss and time : 0.0047661, 170.6 seconds
# Epoch 88 average loss and time : 0.0047771, 173.3 seconds
# Epoch 89 average loss and time : 0.0044409, 172.3 seconds
# Epoch 90 average loss and time : 0.0045383, 176.0 seconds
# Epoch 91 average loss and time : 0.0046460, 171.4 seconds
# Epoch 92 average loss and time : 0.0046248, 194.0 seconds
# Epoch 93 average loss and time : 0.0045983, 195.3 seconds
# Epoch 94 average loss and time : 0.0046806, 172.9 seconds
# Epoch 95 average loss and time : 0.0045397, 187.3 seconds
# Epoch 96 average loss and time : 0.0044473, 171.4 seconds
# Epoch 97 average loss and time : 0.0047241, 171.9 seconds
# Epoch 98 average loss and time : 0.0045720, 171.3 seconds
# Epoch 99 average loss and time : 0.0045641, 171.3 seconds
有谁知道为什么会这样?
我该如何调试?手动检查重量?