Question

我有少量数据，包含8个要素和5个值，可通过回归进行预测。示例数据显示在问题的末尾。

当我运行深度网络的训练时，经过大约400次迭代后，我得到权重的NaN值，而成本函数也导致NaN值，这意味着训练不起作用。为什么是这样？我对TensorFlow很新，我觉得有一些明显的遗漏，可能在成本计算中：

#!/usr/bin/env python

"""
usage:
    program [options]

options:
    -h, --help  display help message
    --version   display version and exit
"""

import docopt
import subprocess

import numpy as np
import tensorflow as tf

def main(options):

    # configuration
    number_classes = 5
    epochs         = 10000001
    learning_rate  = 0.1
    logs_path      = "/tmp/run"

    tf.reset_default_graph()

    # TensorBoard
    subprocess.Popen(["killall tensorboard"],            shell = True)
    subprocess.Popen(["rm -rf /tmp/run"],                shell = True)
    subprocess.Popen(["tensorboard --logdir=/tmp/run"],  shell = True)
    subprocess.Popen(["xdg-open http://127.0.1.1:6006"], shell = True)

    data = np.loadtxt(
        "data.csv",
        skiprows  = 1,
        delimiter = ",",
        dtype     = np.float32
    )

    x_data = data[:, 0:- number_classes]
    y_data = data[:, number_classes + 3:]

    with tf.name_scope("input"):
        X          = tf.placeholder(tf.float32, [None, x_data.shape[1]])
        Y          = tf.placeholder(tf.float32, [None, y_data.shape[1]])
    tf.summary.histogram("input", X)

    with tf.name_scope("architecture"):
        W1         = tf.Variable(tf.random_normal([x_data.shape[1], 50]),  name = "weight1")
        b1         = tf.Variable(tf.random_normal([50]),                   name = "bias1"  )
        layer1     = tf.sigmoid(tf.matmul(X, W1) + b1)

        W2         = tf.Variable(tf.random_normal([50, 50]),               name = "weight2")
        b2         = tf.Variable(tf.random_normal([50]),                   name = "bias2"  )
        layer2     = tf.sigmoid(tf.matmul(layer1, W2) + b2)

        W3         = tf.Variable(tf.random_normal([50, 50]),               name = "weight3")
        b3         = tf.Variable(tf.random_normal([50]),                   name = "bias3"  )
        layer3     = tf.sigmoid(tf.matmul(layer2, W3) + b3)

        W4         = tf.Variable(tf.random_normal([50, y_data.shape[1]]),  name = "weight4")
        b4         = tf.Variable(tf.random_normal([y_data.shape[1]]),      name = "bias4"  )
        hypothesis = tf.sigmoid(tf.matmul(layer3, W4) + b4)
    #tf.summary.histogram("W1", W1)
    #tf.summary.histogram("b1", b1)
    #tf.summary.histogram("layer1", layer1)
    #tf.summary.histogram("W2", W2)
    #tf.summary.histogram("b2", b2)
    #tf.summary.histogram("layer2", layer2)
    #tf.summary.histogram("W3", W3)
    #tf.summary.histogram("b3", b3)
    #tf.summary.histogram("layer3", layer3)
    #tf.summary.histogram("W4", W4)
    #tf.summary.histogram("b4", b4)
    #tf.summary.histogram("hypothesis", hypothesis)

    with tf.name_scope("cost"):
        cost       = -tf.reduce_mean(Y * tf.log(hypothesis) + (1 - Y) * tf.log(1 - hypothesis))
        train      = tf.train.GradientDescentOptimizer(learning_rate = learning_rate).minimize(cost)
    tf.summary.scalar("cost", cost)

    with tf.name_scope("accuracy"):
        # accuracy computation: true if hypothesis > 0.5 else false
        predicted  = tf.cast(hypothesis > 0.5, dtype = tf.float32)
        accuracy   = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype = tf.float32))
    tf.summary.scalar("accuracy", accuracy)

    summary_operation = tf.summary.merge_all()

    with tf.Session() as sess:

        sess.run(tf.global_variables_initializer())

        writer = tf.summary.FileWriter(logs_path)

        for step in range(epochs):

            _, summary = sess.run([train, summary_operation], feed_dict = {X: x_data, Y: y_data})

            writer.add_summary(summary, step)

            if step % 2000 == 0:
                print("\nstep: {step}\ncost: {cost}".format(
                    step = step,
                    cost = sess.run(cost, feed_dict = {X: x_data, Y: y_data})
                ))

        print("\naccuracy report:")
        h, c, a = sess.run([hypothesis, predicted, accuracy], feed_dict = {X: x_data, Y: y_data})
        print("\nhypothesis:\n\n{hypothesis}\n\ncorrect (Y):\n\n{correct}\n\naccuracy: {accuracy}".format(
            hypothesis = h,
            correct    = c,
            accuracy   = a
        ))

    subprocess.Popen(["killall tensorboard"],            shell = True)

if __name__ == "__main__":

    options = docopt.docopt(__doc__)
    if options["--version"]:
        print(version)
        exit()
    main(options)

示例数据（data.csv）如下：

i1,i2,i3,i4,i5,i6,i7,i8,o1,o2,o3,o4,o5
-1,-0.5352926315,-0.4935420352,-1,-0.4944026038,-0.253963208,-1,-0.8880478088,0.8546009151,0.4774468085,-0.062295082,-0.6523892052,-0.4402645721
-0.5522903811,-0.7816838836,-0.7696659213,-0.2629286881,-0.3547593658,-1,0.089373879,-0.4944223108,-0.8190137265,-0.1268085106,-0.3398907104,-0.760336717,-0.6254650682
-0.0890453721,-0.7323068451,-0.7808130027,1,-1,0.1459624163,-0.0262982891,1,0.4112862227,0.8110638298,0.5038251366,0.6484278287,0.8983050847
-0.4235644283,1,1,-0.7711244178,1,0.577533724,0.3119722709,-0.9884462151,-1,-1,-1,-0.9767269126,-1
0.8491470054,-0.2859093477,-0.3093481961,0.3165184782,-0.1465875018,-0.2885239412,-0.5480321625,-0.7645418327,0.649211998,0.130212766,-0.0754098361,-0.7865808368,-0.6494419181
1,-0.2139813608,-0.2666844664,0.8739490716,-0.85062216,1,0.4008423513,-0.1756972112,0.6807320793,0.050212766,1,1,1
-0.2383303085,-1,-1,0.0974414807,-0.824866335,-0.7023770251,-0.1418898495,-0.2003984064,0.8403660397,0.1234042553,0.2480874317,-0.3265659817,-0.2029764365
-0.4014083485,0.050288274,0.0192515213,0.3188169116,-0.3785831324,0.7963543885,-0.7363722467,-1,0.3523131673,1,0.1781420765,-1,-0.9859446052
0.2686170599,-0.1839134298,-0.2154657411,0.3874070042,-0.7997421445,0.9813940655,1,0.1442231076,1,0.8144680851,0.3333333333,0.1562267888,0.2740801984
0.8191506352,-0.6580510916,-0.6434992046,-0.3727696123,-0.7772633879,-0.056711512,0.2537734518,0.9589641434,0.6573462125,-0.3446808511,-0.0207650273,0.1606833375,0.2740801984

为什么这个TensorFlow深度网络的成本函数会产生NaN？

0 个答案: