I'm working through an RL tutorial using TensorFlow and Gym's Space Invaders. Unfortunately, my training never gets through all the episodes (50 in this case); it always shuts down somewhere between episodes 10 and 15. Also, after the shutdown, the code that follows `print(episode)` (shown below) no longer runs, so there seems to be a problem with the session.
saver = tf.train.Saver()

if training:
    with tf.Session() as sess:
        # Initialize the variables
        sess.run(tf.global_variables_initializer())

        # Initialize the decay rate (that will be used to reduce epsilon)
        decay_step = 0

        for episode in range(total_episodes):
            print(episode)
            step = 0
            episode_rewards = []
            state = env.reset()
            state, stacked_frames = stack_frames(stacked_frames, state, True)

            while step < max_steps:
                step += 1
                decay_step += 1

                # Epsilon-greedy action selection; action comes back one-hot
                action, explore_probability = predict_action(explore_start,
                                                             explore_stop, decay_rate,
                                                             decay_step, state, possible_actions)

                # np.nonzero(action)[0] maps the one-hot vector to the action index
                next_state, reward, done, info = env.step(np.nonzero(action)[0])

                if episode_render:
                    env.render()

                episode_rewards.append(reward)

                if done:
                    # The episode is over: feed a blank frame into the stack
                    next_state = np.zeros((110, 84), dtype=np.int)
                    next_state, stacked_frames = stack_frames(stacked_frames,
                                                              next_state, False)

                    # Set step = max_steps to end the episode
                    step = max_steps

                    # Get the total reward of the episode
                    total_reward = np.sum(episode_rewards)

                    print('Episode: {}'.format(episode),
                          'Total reward: {}'.format(total_reward),
                          'Explore P: {:.4f}'.format(explore_probability),
                          'Training Loss {:.4f}'.format(loss))

                    # Store tuple <st, at, rt+1, st+1> in memory D
                    memory.add((state, action, reward, next_state, done))
                    state = next_state
                else:
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    memory.add((state, action, reward, next_state, done))
                    state = next_state

                ## Learning Part
                # Obtain a random mini-batch from memory
                batch = memory.sample(batch_size)
                states_mb = np.array([each[0] for each in batch], ndmin=3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                dones_mb = np.array([each[4] for each in batch])

                target_Qs_batch = []

                # Get Q values for the next state
                Qs_next_state = sess.run(DQNetwork.output,
                                         feed_dict={DQNetwork.inputs_: next_states_mb})

                # Set Q_target = r if the episode ends at s+1,
                # otherwise Q_target = r + gamma * max(Q(s', a'))
                for i in range(0, len(batch)):
                    terminal = dones_mb[i]
                    # If we are in a terminal state, the target is just the reward
                    if terminal:
                        target_Qs_batch.append(rewards_mb[i])
                    else:
                        target = rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                        target_Qs_batch.append(target)

                targets_mb = np.array([each for each in target_Qs_batch])

                loss, _ = sess.run([DQNetwork.loss, DQNetwork.optimizer],
                                   feed_dict={DQNetwork.inputs_: states_mb,
                                              DQNetwork.target_Q: targets_mb,
                                              DQNetwork.actions_: actions_mb})

                # Write TF summaries
                summary = sess.run(write_op, feed_dict={DQNetwork.inputs_: states_mb,
                                                        DQNetwork.target_Q: targets_mb,
                                                        DQNetwork.actions_: actions_mb})
                writer.add_summary(summary, episode)
                writer.flush()

            # Save the model every 5 episodes
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/Space_Invaders_model.ckpt")
                print("Model Saved")
There are no error messages in the Spyder console either. The actual output is:
Episode: 10 Total reward: 370.0 Explore P: 0.9250 Training Loss 0.0252
Model Saved
11
Episode: 11 Total reward: 100.0 Explore P: 0.9200 Training Loss 3.4623
12
Episode: 12 Total reward: 200.0 Explore P: 0.9117 Training Loss 22.2125
13
But it should go all the way up to 50 (total_episodes).
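To surface whatever kills the loop, here is a sketch of how the episode loop could be instrumented (the loop over `total_episodes` is the one from above; `faulthandler` is standard library and also reports hard crashes such as segfaults, which an IDE console may otherwise swallow):

import faulthandler
import traceback

faulthandler.enable()  # dump a traceback even on hard crashes (e.g. a segfault)

try:
    for episode in range(total_episodes):
        ...  # training loop body from above, unchanged
except Exception:
    traceback.print_exc()  # make any Python-level exception visible before exiting
    raise

Running the script from a plain terminal (`python script.py`) instead of Spyder may also show output that the IDE console loses when the kernel dies.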