Solved: I needed more target output neurons.
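(For anyone hitting the same thing: a minimal sanity check, sketched against the t array and the 2023-unit output layer of the script below, that would have caught it:)

# Sketch: verify every target class index fits inside the output layer.
num_classes = 2023
assert 0 <= t.min() and t.max() < num_classes, \
    "targets must lie in [0, {}), got [{}, {}]".format(num_classes, t.min(), t.max())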
I believe something is going wrong - I now think it is TensorFlow's sparse_softmax_cross_entropy_with_logits function (before this I suspected my data). I wrote a simple program (a 3-layer network) to try to see what is happening:
import tensorflow as tf
import h5py
import numpy as np

def ini_weight_var(shape):
    # He-style initialization: stddev = sqrt(2 / fan_in)
    return tf.Variable(tf.truncated_normal(shape, stddev=(2.0 / shape[0]) ** 0.5))

def ini_bias_var(shape):
    return tf.Variable(tf.constant(0.0, shape=shape))

input_dim = 1287
input_lyr = tf.placeholder(tf.float32, shape=[None, input_dim])
targs = tf.placeholder(tf.int64, shape=[None])

# Hidden layer: 1287 -> 2048 with ELU activation
w1 = ini_weight_var([input_dim, 2048])
b1 = ini_bias_var([2048])
a1 = tf.nn.elu(tf.matmul(input_lyr, w1) + b1)

# Output layer: 2048 -> 2023 logits
w2 = ini_weight_var([2048, 2023])
b2 = ini_bias_var([2023])
out = tf.matmul(a1, w2) + b2
out_sm = tf.nn.softmax(out)

cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(out, targs))
get_acc = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(out_sm, 1), targs), "float"))
train_adam = tf.train.AdamOptimizer(1e-10).minimize(cost)

d = h5py.File('data-amfb/amfb_train_subset.hdf5', 'r')
# shapes: f==(1000,1287); t==(1000,) -- types: f.dtype==np.float32; t.dtype==np.int64
f, t = d['feats'][()], d['targs'][()]

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    for i in range(2):
        logit = sess.run(out, feed_dict={input_lyr: f})
        sm_res = sess.run(out_sm, feed_dict={input_lyr: f})
        print("is 0 in logit: {}".format(bool(np.sum(logit == 0))))  # For Da Tong.
        print("Smallest sm outputs: {}".format(np.sort(np.min(sm_res, axis=1))[:20]))
        print("is logit with Nan: {}".format(bool(np.sum(np.isnan(logit)))))
        print("is sm with Nan: {}".format(bool(np.sum(np.isnan(sm_res)))))
        print("Performing backprop.")
        sess.run(train_adam, feed_dict={input_lyr: f, targs: t})
        print("Score: {}".format(sess.run(get_acc, feed_dict={input_lyr: f, targs: t})))
        print("Cost: {}".format(sess.run(cost, feed_dict={input_lyr: f, targs: t})))
        print()
Here is the output (the softmax values are clearly not excessively small):
is 0 in logit: False
Smallest sm outputs: [ 4.24013763e-07 6.52373728e-07 7.47293313e-07 8.81291555e-07
1.04935509e-06 1.05191452e-06 1.07265134e-06 1.08382812e-06
1.11767417e-06 1.13135320e-06 1.16807541e-06 1.17315426e-06
1.17497621e-06 1.20026777e-06 1.20443030e-06 1.20840150e-06
1.21510743e-06 1.22574534e-06 1.24325027e-06 1.24601388e-06]
is logit with Nan: False
is sm with Nan: False
Performing backprop.
Score: 0.0340000018477
Cost: nan
is 0 in logit: False
Smallest sm outputs: [ nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
nan nan nan nan nan]
is logit with Nan: True
is sm with Nan: True
Performing backprop.
Score: 0.0340000018477
Cost: nan
I checked np.sum(np.isnan(w1).astype(np.int32)) and np.sum(np.isnan(f).astype(np.int32)), and both equal 0, so there are no NaNs in my data or in the weight matrix. The data has mean 0 and stddev 1.
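(Roughly, those checks looked like this; w1 has to be fetched with sess.run first:)

# Sketch of the NaN checks, run inside the session from the script above.
w1_val = sess.run(w1)
print(np.sum(np.isnan(w1_val).astype(np.int32)))   # 0 -> no NaNs in the weight matrix
print(np.sum(np.isnan(f).astype(np.int32)))        # 0 -> no NaNs in the input features
print(f.mean(), f.std())                           # approximately 0 and 1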
(What changed my mind:) I had thought the problem was related to my data, because the issue only occurs with this particular data set; if I use different data (other feature types), it does not happen.
I have tried all the obvious float32/64 and int32/64 variants. I have tried various learning rates and values for epsilon. I have also tried different weight initializations. I am out of ideas now.
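(For concreteness, the learning-rate/epsilon experiments were just hypothetical variations of the optimizer line in the script above, e.g.:)

# Hypothetical variant of the optimizer line: different learning rate and a larger epsilon.
train_adam = tf.train.AdamOptimizer(learning_rate=1e-4, epsilon=1e-4).minimize(cost)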
EDIT: I printed out the hidden-layer activations after training for one epoch, and that is where the NaNs show up; if I print them out before training, everything is fine. So the problem seems to come from tf.nn.sparse_softmax_cross_entropy_with_logits.
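(Roughly how I checked the activations, a sketch using the a1 tensor and session from the script above:)

# Sketch: inspect the hidden-layer activations for NaNs (before vs. after a training step).
a1_val = sess.run(a1, feed_dict={input_lyr: f})
print("NaNs in hidden activations: {}".format(bool(np.sum(np.isnan(a1_val)))))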
My logits are very small. Not sure whether that means anything; softmax should normalize everything anyway.
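(A sketch of a way to check the loss outside TensorFlow, assuming the logit and t arrays from the script above; it uses the standard log-sum-exp trick and fails loudly if a label is out of range:)

# Numerically stable sparse cross-entropy in numpy, for comparison with the TF cost.
def np_sparse_xent(logits, labels):
    assert labels.min() >= 0 and labels.max() < logits.shape[1], "label out of range"
    shifted = logits - logits.max(axis=1, keepdims=True)               # log-sum-exp trick
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -log_probs[np.arange(len(labels)), labels].mean()

print(np_sparse_xent(logit, t))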