我正在尝试训练此多单元RNN网络(对于训练,您可以忽略m_t + 1-> m_t部分)
使用4个LSTM单元层。编码器和解码器只是完全连接的层。 G_t和m_t分别是大小为6、69的浮点组。 P_t,m_t + 1也是如此。此RNN的时间步长为48。 但是出于某种原因,我的培训根本无法进行。我很想知道我的代码出了什么问题。
成本函数如下所示
n_steps = 48
n_neurons = 512
n_layers = 4
NUM_OF_INPUTS = 6 + 69
NUM_OF_OUTPUTS = 6 + 69
EPOCHS = 50
sample_size = 12494
batch_size = 128
total_batch = int(sample_size / batch_size)
global_step = tf.Variable(0, trainable=False)
prop_valid = 0.1
time_stamp = 48
def mini_batch(data, bs, i):
return data[i*bs : i*bs+bs,:,:]
#Both X_data_np and Y_data_np are three dimensional, which is the required dimension for the inputs of tf.nn.dynamic_rnn
X_data_np = np.load('X_data.npy')
Y_data_np = np.load('Y_data.npy')
data = np.concatenate([X_data_np, Y_data_np], axis=-1)
np.random.shuffle(data)
#standardize data
mean = np.mean(data)
data = data - mean
std = np.std(data)
data = data / std
train_size = int(sample_size * (1 - prop_valid))
valid_size = int((sample_size - train_size))
train_input = data[:train_size, :, :NUM_OF_INPUTS]
train_label = data[:train_size, :, NUM_OF_INPUTS:]
valid_input = data[train_size:train_size + valid_size, :,:NUM_OF_INPUTS]
valid_label = data[train_size:train_size + valid_size, :,NUM_OF_INPUTS:]
X = tf.placeholder(tf.float32, [None, n_steps, NUM_OF_INPUTS])
Y = tf.placeholder(tf.float32, [None, n_steps, NUM_OF_OUTPUTS])
encoded_inputs = tf.layers.dense(X, 256)
layers = [tf.contrib.rnn.LSTMCell(num_units = n_neurons, activation=tf.nn.tanh) for layer in range(n_layers)]
multi_layer_cell = tf.contrib.rnn.MultiRNNCell(layers)
outputs, _ = tf.nn.dynamic_rnn(multi_layer_cell, encoded_inputs, dtype=tf.float32)
prediction = tf.layers.dense(outputs, NUM_OF_OUTPUTS)
Y = tf.placeholder(tf.float32, [None, n_steps, NUM_OF_OUTPUTS]) #(?, 48, 75)
distance = tf.norm(prediction[:,:,6:75] - Y[:,:,6:75], axis = 2) # (?, 48)
distance_square = tf.square(distance)
#Add all the sum
reduced_distance = tf.math.reduce_sum(distance_square, axis= 1) # (?, )
#Mean of all mini batch data
train_loss = tf.math.reduce_mean(reduced_distance, axis= 0) # ()
learning_rate = 0.001
trainOptimizer = tf.train.AdamOptimizer(learning_rate).minimize(train_loss, global_step=global_step)
sess = tf.Session()
tf.global_variables_initializer().run(session=sess)
for epoch in range(EPOCHS):
for batch_idx in range(total_batch):
train_batch_input = mini_batch(train_input, batch_size, batch_idx)
train_batch_label = mini_batch(train_label, batch_size, batch_idx)
_, loss= sess.run([trainOptimizer, train_loss], feed_dict={X:train_batch_input,Y:train_batch_label})
if (epoch+1) % 10 == 0:
prediction2 = sess.run(prediction, feed_dict={X:valid_input})
valid_error = np.mean(np.sum(np.square(np.linalg.norm(prediction2[:,:,6:75] - valid_label[:,:,6:75], axis = 2)), axis = 1), axis = 0)
print("Epoch: %05d tL: %.4f vE: %.4f" % (epoch+1, loss, valid_error))
结果如下
Epoch: 00010 tL: nan vE: 4.3044
Epoch: 00020 tL: nan vE: 4.3114
Epoch: 00030 tL: nan vE: 4.2962
Epoch: 00040 tL: nan vE: 4.3009
Epoch: 00050 tL: nan vE: 4.2899
无论训练数据有多小,训练损失始终是微不足道的,因此我认为根本的问题在于我在哪里进行训练。验证错误不是nan,所以我想数据本身不包含nan。 我没有在代码中解决一个关键问题吗?任何帮助,将不胜感激!预先感谢。
答案 0 :(得分:0)
之所以验证错误显示正常值而训练错误没有显示原因,是因为我正在制作具有nan值的小型批次。
显然
Option Explicit
Private Sub CommandButton1_Click()
Dim MyFolder As String, MyFile As String
Dim StartTime As Double
Dim MinutesElapsed As String
Dim Filename As String
Dim Cell As String
Dim Counter As Long
If ThisWorkbook.Sheets("Sheet1").Range("C7").Value = vbNullString Then
MsgBox "Enter Tab Name"
Exit Sub
End If
StartTime = Timer
With Application.FileDialog(msoFileDialogFolderPicker)
.AllowMultiSelect = False
.Title = "Select a Folder"
If .Show = True Then
MyFolder = .SelectedItems(1)
End If
If .SelectedItems.Count = 0 Then Exit Sub
Err.Clear
End With
'Turns settings off
Application.ScreenUpdating = False
Application.DisplayStatusBar = False
Application.EnableEvents = False
Application.Calculation = xlCalculationManual
MyFile = Dir(MyFolder & "\", vbReadOnly)
Do While MyFile <> ""
DoEvents
On Error GoTo 0
Workbooks.Open Filename:=MyFolder & "\" & MyFile, UpdateLinks:=False
Dim ReportSheet As Worksheet
Dim MySheet As String
Dim allColumns As Range
MySheet = ThisWorkbook.Sheets("Sheet1").Range("C7").Value
Set ReportSheet = Sheets(MySheet)
Set allColumns = ReportSheet.Columns("N:S")
allColumns.Hidden = True
With ReportSheet.PageSetup
.Zoom = False
.FitToPagesWide = 1 '.FitToPagesTall = 1
End With
Filename = ActiveWorkbook.Name
Cell = Replace(Filename, ".xlsx", ".PDF")
ReportSheet.Select
ReportSheet.PageSetup.Orientation = xlLandscape
ReportSheet.ExportAsFixedFormat Type:=xlTypePDF, Filename:=ThisWorkbook.Path & "\" & Cell, _
Quality:=xlQualityStandard, IncludeDocProperties:=True, _
IgnorePrintAreas:=True, OpenAfterPublish:=False
Counter = Counter + 1
Workbooks(MyFile).Close SaveChanges:=False
MyFile = Dir
Loop
'turns settings back on that you turned off before looping folders
Application.ScreenUpdating = True
Application.DisplayStatusBar = True
Application.EnableEvents = True
Application.Calculation = xlCalculationManual
MinutesElapsed = Format((Timer - StartTime) / 86400, "hh:mm:ss")
MsgBox "Successfully Converted " & Counter & " Files in " & MinutesElapsed & " minutes", vbInformation
End Sub
和
sample_size = 12494
batch_size = 128
total_batch = int(sample_size / batch_size)
train_size = int(sample_size * (1 - prop_valid))
没有道理。 for batch_idx in range(total_batch):
train_batch_input = mini_batch(train_input, batch_size, batch_idx)
train_batch_label = mini_batch(train_label, batch_size, batch_idx)
应该是total_batch
真的很难找到原因,当数组切片超出范围时numpy不会返回任何错误。
无论如何,希望它能在将来帮助类似问题的人!