多单元LSTM RNN返回nan训练错误

时间:2020-02-19 11:04:00

标签: python tensorflow lstm recurrent-neural-network

我正在尝试训练此多单元RNN网络(对于训练,您可以忽略m_t + 1-> m_t部分)

enter image description here

使用4个LSTM单元层。编码器和解码器只是完全连接的层。 G_t和m_t分别是大小为6、69的浮点组。 P_t,m_t + 1也是如此。此RNN的时间步长为48。 但是出于某种原因,我的培训根本无法进行。我很想知道我的代码出了什么问题。

成本函数如下所示

enter image description here

n_steps = 48
n_neurons = 512
n_layers = 4
NUM_OF_INPUTS = 6 + 69
NUM_OF_OUTPUTS = 6 + 69
EPOCHS = 50
sample_size = 12494
batch_size = 128
total_batch = int(sample_size / batch_size)
global_step = tf.Variable(0, trainable=False)
prop_valid = 0.1
time_stamp = 48

def mini_batch(data, bs, i):
    return data[i*bs : i*bs+bs,:,:]

#Both X_data_np and Y_data_np are three dimensional, which is the required dimension for the inputs of tf.nn.dynamic_rnn
X_data_np = np.load('X_data.npy')
Y_data_np = np.load('Y_data.npy')
data = np.concatenate([X_data_np, Y_data_np], axis=-1)
np.random.shuffle(data)
#standardize data
mean = np.mean(data)
data = data - mean
std = np.std(data)
data = data / std

train_size = int(sample_size * (1 - prop_valid))
valid_size = int((sample_size - train_size))

train_input = data[:train_size, :, :NUM_OF_INPUTS]
train_label = data[:train_size, :, NUM_OF_INPUTS:]
valid_input = data[train_size:train_size + valid_size, :,:NUM_OF_INPUTS]
valid_label = data[train_size:train_size + valid_size, :,NUM_OF_INPUTS:]

X = tf.placeholder(tf.float32, [None, n_steps, NUM_OF_INPUTS])
Y = tf.placeholder(tf.float32, [None, n_steps, NUM_OF_OUTPUTS])
encoded_inputs = tf.layers.dense(X, 256)
layers = [tf.contrib.rnn.LSTMCell(num_units = n_neurons, activation=tf.nn.tanh) for layer in range(n_layers)]
multi_layer_cell = tf.contrib.rnn.MultiRNNCell(layers)
outputs, _ = tf.nn.dynamic_rnn(multi_layer_cell, encoded_inputs, dtype=tf.float32)
prediction = tf.layers.dense(outputs, NUM_OF_OUTPUTS) 

Y = tf.placeholder(tf.float32, [None, n_steps, NUM_OF_OUTPUTS]) #(?, 48, 75)
distance = tf.norm(prediction[:,:,6:75] - Y[:,:,6:75], axis = 2)  # (?, 48)
distance_square = tf.square(distance)
#Add all the sum
reduced_distance = tf.math.reduce_sum(distance_square, axis= 1)  # (?, )
#Mean of all mini batch data
train_loss = tf.math.reduce_mean(reduced_distance, axis= 0) # ()

learning_rate = 0.001
trainOptimizer = tf.train.AdamOptimizer(learning_rate).minimize(train_loss, global_step=global_step)

sess = tf.Session()
tf.global_variables_initializer().run(session=sess)

for epoch in range(EPOCHS):
    for batch_idx in range(total_batch):
        train_batch_input = mini_batch(train_input, batch_size, batch_idx)
        train_batch_label = mini_batch(train_label, batch_size, batch_idx)
        _, loss= sess.run([trainOptimizer, train_loss], feed_dict={X:train_batch_input,Y:train_batch_label})
    if (epoch+1) % 10 == 0:
        prediction2 = sess.run(prediction, feed_dict={X:valid_input})
        valid_error = np.mean(np.sum(np.square(np.linalg.norm(prediction2[:,:,6:75] - valid_label[:,:,6:75], axis = 2)), axis = 1), axis = 0)
        print("Epoch: %05d tL: %.4f vE: %.4f" % (epoch+1, loss, valid_error))

结果如下

Epoch: 00010 tL: nan vE: 4.3044
Epoch: 00020 tL: nan vE: 4.3114
Epoch: 00030 tL: nan vE: 4.2962
Epoch: 00040 tL: nan vE: 4.3009
Epoch: 00050 tL: nan vE: 4.2899

无论训练数据有多小,训练损失始终是微不足道的,因此我认为根本的问题在于我在哪里进行训练。验证错误不是nan,所以我想数据本身不包含nan。 我没有在代码中解决一个关键问题吗?任何帮助,将不胜感激!预先感谢。

1 个答案:

答案 0 :(得分:0)

之所以验证错误显示正常值而训练错误没有显示原因,是因为我正在制作具有nan值的小型批次。

显然

Option Explicit

Private Sub CommandButton1_Click()



    Dim MyFolder As String, MyFile As String

    Dim StartTime As Double

    Dim MinutesElapsed As String

    Dim Filename As String

    Dim Cell As String

    Dim Counter As Long



    If ThisWorkbook.Sheets("Sheet1").Range("C7").Value = vbNullString Then

        MsgBox "Enter Tab Name"

        Exit Sub



    End If


    StartTime = Timer



    With Application.FileDialog(msoFileDialogFolderPicker)

        .AllowMultiSelect = False

        .Title = "Select a Folder"

        If .Show = True Then

            MyFolder = .SelectedItems(1)

        End If



        If .SelectedItems.Count = 0 Then Exit Sub

        Err.Clear

    End With


    'Turns settings off
    Application.ScreenUpdating = False

    Application.DisplayStatusBar = False

    Application.EnableEvents = False

    Application.Calculation = xlCalculationManual


    MyFile = Dir(MyFolder & "\", vbReadOnly)


    Do While MyFile <> ""

        DoEvents

        On Error GoTo 0

        Workbooks.Open Filename:=MyFolder & "\" & MyFile, UpdateLinks:=False


        Dim ReportSheet As Worksheet

        Dim MySheet As String

        Dim allColumns As Range



        MySheet = ThisWorkbook.Sheets("Sheet1").Range("C7").Value



        Set ReportSheet = Sheets(MySheet)

        Set allColumns = ReportSheet.Columns("N:S")

        allColumns.Hidden = True



        With ReportSheet.PageSetup

            .Zoom = False

            .FitToPagesWide = 1                  '.FitToPagesTall = 1

        End With


        Filename = ActiveWorkbook.Name



        Cell = Replace(Filename, ".xlsx", ".PDF")

        ReportSheet.Select



        ReportSheet.PageSetup.Orientation = xlLandscape


        ReportSheet.ExportAsFixedFormat Type:=xlTypePDF, Filename:=ThisWorkbook.Path & "\" & Cell, _
                                        Quality:=xlQualityStandard, IncludeDocProperties:=True, _
                                        IgnorePrintAreas:=True, OpenAfterPublish:=False



        Counter = Counter + 1



        Workbooks(MyFile).Close SaveChanges:=False

        MyFile = Dir

    Loop


    'turns settings back on that you turned off before looping folders



    Application.ScreenUpdating = True

    Application.DisplayStatusBar = True

    Application.EnableEvents = True

    Application.Calculation = xlCalculationManual


    MinutesElapsed = Format((Timer - StartTime) / 86400, "hh:mm:ss")

    MsgBox "Successfully Converted " & Counter & " Files in " & MinutesElapsed & " minutes", vbInformation



End Sub

sample_size = 12494
batch_size = 128
total_batch = int(sample_size / batch_size)
train_size = int(sample_size * (1 - prop_valid))

没有道理。 for batch_idx in range(total_batch): train_batch_input = mini_batch(train_input, batch_size, batch_idx) train_batch_label = mini_batch(train_label, batch_size, batch_idx) 应该是total_batch

真的很难找到原因,当数组切片超出范围时numpy不会返回任何错误。

无论如何,希望它能在将来帮助类似问题的人!