I'm trying to implement a deep Q-learning (DQN) algorithm, and I'm attempting to run the following graph in TensorFlow:
class DQN:
    def __init__(self, env, n_hidden, learning_rate):
        self.image_input = tf.placeholder(shape=[None, 128, 128, 3], dtype=tf.float32)
        self.conv1 = tf.contrib.layers.convolution2d(inputs=self.image_input, num_outputs=32,
                                                     kernel_size=[8,8], stride=[4,4], padding="VALID")
        self.conv2 = tf.contrib.layers.convolution2d(inputs=self.conv1, num_outputs=64,
                                                     kernel_size=[4,4], stride=[2,2], padding="VALID")
        self.conv3 = tf.contrib.layers.convolution2d(inputs=self.conv2, num_outputs=64,
                                                     kernel_size=[3,3], stride=[1,1], padding="VALID")
        self.conv4 = tf.contrib.layers.convolution2d(inputs=self.conv3, num_outputs=512,
                                                     kernel_size=[7,7], stride=[1,1], padding="VALID")
        self.conv_out = tf.contrib.layers.flatten(self.conv4)
        self.weights_1 = tf.Variable(tf.random_normal([18432, env.action_space.n], stddev=0.35), name="fully1_w")
        self.bias_1 = tf.Variable(tf.zeros(env.action_space.n), name="fully1_b")
        self.q_out = tf.add(tf.matmul(self.conv_out, self.weights_1), self.bias_1, name="q_out")
        self.predict = tf.argmax(self.q_out, 1)
        self.target_q = tf.placeholder(shape=[None], dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions, env.action_space.n, dtype=tf.float32)
        self.q_value = tf.reduce_sum(tf.multiply(self.q_out, self.actions_onehot), reduction_indices=1)
        self.td_error = tf.square(self.target_q - self.q_value)
        self.loss = tf.reduce_mean(self.td_error)
        self.trainer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
        self.grads_and_vars = self.trainer.compute_gradients(self.loss)
        self.trainer.apply_gradients(self.grads_and_vars)
And here is the training procedure:
tf.reset_default_graph()
main_qf = DQN(env, n_hidden=10, learning_rate=1.0)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
trainables = tf.trainable_variables()
target_ops = update_target_graph(trainables, tau, mode="periodically")
grads = []
experience_buffer = ExperienceReplay(exp_size)
total_rewards = np.zeros(num_episodes)
losses = np.zeros(num_episodes)

with tf.Session() as session:
    state = env.reset()
    session.run(init)
    update_target(target_ops, session)
    for _iter in range(num_episodes):
        state = env.reset()
        # play ===================================================================================
        done = False
        img = process_image(env.render(mode="rgb_array"))
        episode = []
        while not done:
            # e-greedy
            if np.random.rand() < epsilon:
                action = np.random.choice(range(env.action_space.n))
            else:
                feed_dict = {main_qf.image_input: img[None, :, :, :]}
                action = session.run(main_qf.predict, feed_dict=feed_dict)[0]
            new_state, reward, done, _ = env.step(action)
            new_img = process_image(env.render(mode="rgb_array"))
            experience_buffer.add((img, action, new_img, reward, done))
        # update results =========================================================================
        total_rewards[_iter] += reward
        # Adjust params (epsilon) ================================================================
        if epsilon >= min_epsilon:
            epsilon -= decay
        # train ==================================================================================
        prev_state, actions, new_state, rewards, is_terminal = experience_buffer.sample(batch_size)
        q_function = session.run([main_qf.q_out], feed_dict={
            main_qf.image_input: prev_state})
        q_target = session.run([main_qf.predict], feed_dict={
            main_qf.image_input: new_state})
        q_target = rewards + gamma * q_target * is_terminal
        loss, weights, grad = session.run([main_qf.loss, main_qf.weights_1, main_qf.grads_and_vars], feed_dict={
            main_qf.image_input: prev_state,
            main_qf.target_q: q_target,
            main_qf.actions: actions
        })
        losses[_iter] = loss
        update_target(target_ops, session)
But for some reason I don't understand, the training procedure does not update the network's weights. I fetched the gradients (grads_and_vars) to check whether they were vanishing, but that is not the case: the gradients have fairly large values. I also tried assigning values to the variables manually (by calling main_qf.weights_1.assign(val)), but that didn't work either.
Is the problem something in how I composed the graph, or in the way I'm running the session? I'm completely lost on this.
Answer (score: 1)
Right now your graph is never asked to minimize the loss or to apply the gradient updates.
The graph element that will actually update the weights is the "self.trainer.apply_gradients(self.grads_and_vars)" operation, and I don't see that op being fetched anywhere in your session.run() calls.
Try assigning it to an attribute and adding it to your run(), and the weights should be updated:
self.UpdateWeights = self.trainer.apply_gradients(self.grads_and_vars)
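For example, a minimal sketch of how that op could then be fetched inside the "train" section of the question's episode loop (reusing the question's own variable names), so that the gradient step is actually executed:

        # fetching main_qf.UpdateWeights alongside the loss runs the
        # apply_gradients op and therefore updates the weights
        loss, weights, _ = session.run(
            [main_qf.loss, main_qf.weights_1, main_qf.UpdateWeights],
            feed_dict={
                main_qf.image_input: prev_state,
                main_qf.target_q: q_target,
                main_qf.actions: actions
            })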
If you only add "self.trainer" to your run() you won't apply the gradients; but if you switch to minimize(self.loss), you no longer need the compute/apply gradients lines at all:
self.trainer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(self.loss)
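With minimize(), self.trainer itself is the training op, so the run() call reduces to fetching it directly. A sketch under the same assumptions as above:

        # self.trainer is now the op returned by minimize(self.loss);
        # fetching it performs one gradient-descent step
        loss, _ = session.run(
            [main_qf.loss, main_qf.trainer],
            feed_dict={
                main_qf.image_input: prev_state,
                main_qf.target_q: q_target,
                main_qf.actions: actions
            })

Either way, the key point is the same: an op only has an effect if it (or something that depends on it) appears in the fetches of session.run().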