我正在尝试训练一个简单的RNN,用来预测一个移位3步的序列的值。我在Siraj Raval的代码基础上做了少量修改。他的代码原本解决的是二分类问题,而我想用类似的RNN方法来处理浮点数序列。然而,在我去掉最后一层的softmax和argmax函数之后,即使输入仍是二进制、只是把输出改为浮点数,得到的结果也非常糟糕:输出几乎是一个常数(约0.43296)。下面是最终的代码。
from __future__ import print_function, division

from IPython.display import Image
from IPython.core.display import HTML
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
# Training schedule.
num_epochs = 200  # number of times fresh data is generated and trained on

# Data dimensions.
total_series_length = 50000  # samples generated per epoch
batch_size = 10  # number of sub-series (rows) processed in parallel
truncated_backprop_length = 40  # time steps unrolled per training step
echo_step = 3  # how many positions the target lags behind the input

# Model dimensions.
state_size = 4  # width of the recurrent hidden state
num_classes = 1  # width of the output layer (one regression value)

# Number of truncated windows that fit into one epoch of data.
num_batches = total_series_length // (batch_size * truncated_backprop_length)
# Step 1 - Collect data
# The input is a random binary vector; the target is an "echo" of that input,
# delayed by echo_step positions (i.e. shifted to the right).
def generateData():
    """Create one epoch of echo-task data.

    Returns:
        A tuple ``(x, y)`` of arrays, each with ``batch_size`` rows. ``x``
        holds random 0/1 samples; ``y`` is the same sequence delayed by
        ``echo_step`` positions, with its first ``echo_step`` entries zeroed.
    """
    # Draw total_series_length samples from {0, 1} with equal probability.
    signal = np.random.choice(2, total_series_length, p=[0.5, 0.5])
    # np.roll with a positive shift moves values to the right and wraps the
    # tail around to the front, so the wrapped-in entries are blanked out.
    echo = np.roll(signal, echo_step)
    echo[:echo_step] = 0
    # Row-major reshape: each row is one contiguous sub-series, which the
    # training loop later slices column-wise into mini-batches.
    rows = (batch_size, -1)
    return signal.reshape(rows), echo.reshape(rows)
# Step 2 - Build the Model
# Inputs and targets for one truncated window: one row per sub-series,
# one column per time step.
batchX_placeholder = tf.placeholder(tf.float32, [batch_size, truncated_backprop_length])
batchY_placeholder = tf.placeholder(tf.float32, [batch_size, truncated_backprop_length])
# Hidden state fed in at the start of the window.
init_state = tf.placeholder(tf.float32, [batch_size, state_size])

# Recurrent layer: maps the concatenation [input, previous state]
# (state_size + 1 columns) to the next state. Weights start random.
W = tf.Variable(np.random.rand(state_size + 1, state_size), dtype=tf.float32)
b = tf.Variable(np.zeros((1, state_size)), dtype=tf.float32)
# Linear read-out layer: maps a state to num_classes output values.
W2 = tf.Variable(np.random.rand(state_size, num_classes), dtype=tf.float32)
b2 = tf.Variable(np.zeros((1, num_classes)), dtype=tf.float32)

# Split the window along the time axis: one [batch_size] tensor per step.
inputs_series = tf.unstack(batchX_placeholder, axis=1)
labels_series = tf.unstack(batchY_placeholder, axis=1)

# Forward pass: unroll the recurrence over the window and keep every state.
current_state = init_state
states_series = []
for current_input in inputs_series:
    # Turn the [batch_size] input into a column so it can be concatenated.
    current_input = tf.reshape(current_input, [batch_size, 1])
    # Place input and previous state side by side (more columns).
    input_and_state_concatenated = tf.concat([current_input, current_state], 1)
    # Affine transform followed by tanh; b is broadcast over the batch.
    next_state = tf.tanh(tf.matmul(input_and_state_concatenated, W) + b)
    states_series.append(next_state)
    current_state = next_state

print(states_series)
# Linear read-out per time step; each element has shape [batch_size, num_classes].
logits_series = [tf.matmul(state, W2) + b2 for state in states_series]
# Regression setup: the raw linear outputs are the predictions (no softmax).
predictions_series = logits_series
print("Logits", logits_series)
print("Labels", labels_series)
# Each label tensor is [batch_size] while each logits tensor is
# [batch_size, 1]. Without the reshape, squared_difference would broadcast
# the pair to [batch_size, batch_size] and compare every prediction with
# every label in the batch, which pushes the output toward a constant.
losses = [
    tf.squared_difference(tf.reshape(labels, [batch_size, 1]), logits)
    for logits, labels in zip(logits_series, labels_series)
]
# Mean squared error over all time steps and batch rows (a single scalar).
total_loss = tf.reduce_mean(losses)
train_step = tf.train.AdamOptimizer(0.1).minimize(total_loss)
# Step 3 - Training the network
with tf.Session() as sess:
    # Variables must be initialized explicitly before the graph is run.
    # tf.global_variables_initializer replaces the deprecated
    # tf.initialize_all_variables.
    sess.run(tf.global_variables_initializer())
    # Interactive plotting mode, with an (initially empty) figure shown.
    plt.ion()
    plt.figure()
    plt.show()
    # Loss value of every training step, in order.
    loss_list = []
    for epoch_idx in range(num_epochs):
        # Fresh random data for every epoch.
        x, y = generateData()
        # Start each epoch from an all-zero hidden state.
        _current_state = np.zeros((batch_size, state_size))
        print("New data, epoch", epoch_idx)
        for batch_idx in range(num_batches):
            # Column range of the current truncated window. Backpropagation
            # is limited to this window; only the final state value is
            # carried over to the next window via the feed_dict.
            start_idx = batch_idx * truncated_backprop_length
            end_idx = start_idx + truncated_backprop_length
            batchX = x[:, start_idx:end_idx]
            batchY = y[:, start_idx:end_idx]
            # One optimisation step; also fetch the loss, the final state
            # of the window and the per-step predictions.
            _total_loss, _, _current_state, _predictions_series = sess.run(
                [total_loss, train_step, current_state, predictions_series],
                feed_dict={
                    batchX_placeholder: batchX,
                    batchY_placeholder: batchY,
                    init_state: _current_state,
                })
            loss_list.append(_total_loss)
            # Periodic progress report with predictions next to targets.
            if batch_idx % 100 == 0:
                print("Step", batch_idx, "Loss", _total_loss)
                print(_predictions_series)
                print(batchY)
非常感谢任何帮助
答案 0 :(得分:0)
我建议在分批之后把输入打印出来检查一下。
我用批量大小为1运行了这段代码,损失很快就降到了接近零(第500步时损失为0.000245932)。
而批量更大时损失却不会这样下降,这表明输入切片/转置的方式可能存在错误。