I am trying to implement a DQN with experience replay in TensorFlow. It seems to be working, i.e. my loss is decreasing. However, as the training loop runs, each training iteration gets slower and slower, as if my TensorFlow graph were growing larger and larger and slowing the training down. I can't see anything wrong with my own code. Can any TensorFlow expert out there point it out? I have put together a scaled-down version of the code below; it operates on random data but produces the same problem.
import numpy as np
import tensorflow as tf
# Function which initializes tensorflow weights for feed-forward NN.
def InitWeights(LayerSizes):
    # Make tensorflow input/output placeholders.
    X = tf.placeholder(shape = (None, LayerSizes[0]), dtype = tf.float32, name = 'InputData')
    y = tf.placeholder(shape = (None, LayerSizes[-1]), dtype = tf.float32, name = 'OutputData')
    # Initialize dictionaries for weights and biases.
    W = {}
    b = {}
    for ii in range(len(LayerSizes)-1):
        layername = 'layer%s' % ii
        with tf.variable_scope(layername):
            ny = LayerSizes[ii]
            nx = LayerSizes[ii+1]
            # Weights (initialized with Xavier initialization).
            W['Weights_'+layername] = tf.get_variable(
                name = 'Weights_'+layername,
                shape = (ny, nx),
                initializer = tf.contrib.layers.xavier_initializer(),
                dtype = tf.float32
            )
            # Bias (initialized with Xavier initialization).
            b['Bias_'+layername] = tf.get_variable(
                name = 'Bias_'+layername,
                shape = (nx,),
                initializer = tf.contrib.layers.xavier_initializer(),
                dtype = tf.float32
            )
    return W, b, X, y
# Function which defines feed-forward neural network operation.
def FeedForward(X, W, b):
    a = X
    # Loop over all layers of the network.
    for ii in range(len(W)):
        # Use the name of each layer as an index.
        layername = 'layer%s' % ii
        # Weighted sum: z = input*W + b
        z = tf.add(tf.matmul(a, W['Weights_'+layername], name = 'WeightedSum_z_'+layername), b['Bias_'+layername])
        # Pass through the activation fcn, a = h(z); the last layer stays linear.
        if ii == len(W)-1:
            a = z
        else:
            a = tf.nn.relu(z, name = 'activation_a_'+layername)
    return a
# Function used for experience replay
def ExperienceReplay(s, a, r, s_prime, gamma, TermState, X, y, yhat, yhatNN2, train_op, loss, sess):
    # Inputs:
    # s         - state(s)
    # a         - action(s)
    # r         - reward(s)
    # s_prime   - new state(s)
    # gamma     - discount factor
    # TermState - scalar giving which action is terminating
    # X         - tensorflow placeholder for network inputs
    # y         - tensorflow placeholder for network outputs
    # yhat      - tensorflow operation for feed forward with NN 1
    # yhatNN2   - tensorflow operation for feed forward with NN 2
    # train_op  - tensorflow training operation
    # loss      - tensorflow fcn for calculating loss
    # sess      - tensorflow session
    # Forward pass through NN2 using s_prime to find max(Q(s',a',theta')).
    Q = sess.run(yhatNN2, feed_dict={X : s_prime})
    # Actions that NN1 thinks are best at the s_prime state.
    a_argmax = np.argmax(sess.run(yhat, feed_dict={X : s_prime}), axis=1)
    # Values from NN2's opinion about the actions NN1 picked.
    Qm = np.zeros(len(r))
    for obs in range(len(r)):
        Qm[obs] = Q[obs, a_argmax[obs]]
    # First make all targets equal to NN1's approximation of Q (so the error is 0 in all unobserved cases).
    Targets = sess.run(yhat, feed_dict={X : s})
    # If the action was experienced, change the target to either the real reward or the discounted future reward.
    for obs in range(len(r)):
        # If the action was episode-terminating, use only the reward as the target.
        if int(a[obs]) == TermState:
            Targets[obs, int(a[obs])] = r[obs]
        # Otherwise use the discounted future reward.
        else:
            Targets[obs, int(a[obs])] = r[obs] + gamma*Qm[obs]
    # Gradient descent one step on NN1 weights.
    sess.run(train_op, feed_dict={X : s, y : Targets})
    # Calculate the losses.
    loss_val = sess.run(loss, feed_dict={X : s, y : Targets})
    meanloss = np.mean(loss_val)
    return loss_val, meanloss
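# For clarity: the two per-observation loops above build a double-DQN style target,
#     Targets[obs, a] = r[obs]                       if a is the terminating action,
#     Targets[obs, a] = r[obs] + gamma * Qm[obs]     otherwise,
# where Qm[obs] = Q_NN2(s', argmax_a Q_NN1(s', a)). A vectorized numpy sketch of the
# same construction (assuming the array shapes used above; shown only for illustration):
#     rows = np.arange(len(r))
#     acts = a.astype(int)
#     Targets[rows, acts] = r + gamma * Q[rows, a_argmax] * (acts != TermState)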
if __name__ == "__main__":
    #### Hyperparameter settings
    N = 64          # Minibatch size during training
    gamma = 0.99    # Discount rate
    C = 100         # How many iterations between NN sync NN2 = NN1
    lr = 1e-7       # Learning rate of NN during training
    nstates = 256   # Number of possible states
    nactions = 256  # Number of possible actions
    TermState = 255 # Which action ends the episode
    """
    Initialize tensorflow session and create one NN with two sets of weights
    """
    # Initialize & configure action-value function Q with random weights theta.
    LayerSizes = [nstates, 1024, 1024, nactions]
    W, b, X, y = InitWeights(LayerSizes)
    # Define loss function to optimize. Here: quadratic loss fcn, (OutputData - yhat)^2.
    yhat = FeedForward(X, W, b)
    loss = tf.reduce_sum(tf.square(y - yhat), reduction_indices=[0])
    # Define optimizer to use when minimizing loss function.
    all_variables = tf.trainable_variables()
    optimizer = tf.train.AdamOptimizer(learning_rate = lr)
    train_op = optimizer.minimize(loss, var_list = all_variables)
    # Initialize target action-value function Qhat with random weights theta_ = theta.
    with tf.device('/gpu:0'):
        W2 = {}
        b2 = {}
        # Make a hard copy of the tensorflow weights and biases.
        for key in W:
            W2[key] = tf.Variable(W[key].initialized_value())
        for key in b:
            b2[key] = tf.Variable(b[key].initialized_value())
        yhatNN2 = FeedForward(X, W2, b2)
    # Start tf session and initialize variables.
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    ## Generate random data representing state transitions <s,a,r,s'>.
    # Random states
    Ds = np.random.rand(100000, nstates) > 0.5
    Ds = Ds.astype(np.float32)
    # Random actions
    Da = np.random.randint(0, nactions, (100000, 1)).astype(np.float32)
    # Random rewards
    Dr = np.random.rand(100000, 1).astype(np.float32)
    # Random new states
    Ds_prime = np.random.rand(100000, nstates) > 0.5
    Ds_prime = Ds_prime.astype(np.float32)
    """
    Pretrain network and report time every C iterations
    """
    import time
    t0 = time.time()
    for i in range(100000):
        # Randomly pick a minibatch to use.
        MemsToUse = np.random.choice(len(Dr), N)
        s = Ds[MemsToUse, :]
        a = Da[MemsToUse, 0]
        r = Dr[MemsToUse, 0]
        sprime = Ds_prime[MemsToUse, :]
        # Experience replay.
        loss_val, meanloss = ExperienceReplay(s, a, r, sprime, gamma, TermState, X, y, yhat, yhatNN2, train_op, loss, sess)
        # Every C iterations copy NN2 = NN1.
        if (i % C) == 0:
            t1 = time.time()
            print('iter: %i meanloss: %0.5f iteration took %0.2f s' % (i, meanloss, t1-t0))
            t0 = time.time()
            with tf.device('/gpu:0'):
                for key in W:
                    W2[key] = tf.Variable(W[key].initialized_value())
                for key in b:
                    b2[key] = tf.Variable(b[key].initialized_value())
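For reference, a common pattern for the periodic NN2 = NN1 copy in graph-mode TensorFlow is to build tf.assign operations once at graph-construction time and only run them inside the loop. The sketch below is purely an illustration of that pattern against the W/W2/b/b2 dictionaries above, not what my script currently does:

# Sketch only: build the copy ops once, right after W2/b2 and yhatNN2 are created.
CopyOps = [tf.assign(W2[key], W[key]) for key in W] + \
          [tf.assign(b2[key], b[key]) for key in b]
CopyNN2FromNN1 = tf.group(*CopyOps)
# Inside the training loop the sync step would then just be:
#     sess.run(CopyNN2FromNN1)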
Update: After timing the various parts of the code, it appears that the first line of my experience replay function,
Q = sess.run(yhatNN2, feed_dict={X : s_prime})
is what is causing most, if not all, of the slowdown. I don't understand the logic behind why that would be: there are several feed-forward passes in the program, but only this one seems to cause a problem.
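To check whether the graph really is growing, one diagnostic I could add (a sketch only, not part of the script above) is to count the operations in the default graph every C iterations, or to finalize the graph after setup so that any later op creation fails loudly:

# Diagnostic sketch (TF 1.x graph mode): if this count climbs every C
# iterations, new nodes are being added to the graph inside the loop.
print('ops in graph:', len(tf.get_default_graph().get_operations()))

# Alternatively, lock the graph once construction is finished; any attempt
# to add new ops afterwards then raises a RuntimeError at the offending line.
sess.graph.finalize()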