I'm working through an RL tutorial using TensorFlow and Gym's Space Invaders. Unfortunately, my training never gets through all the episodes (50 in this case); it always shuts down somewhere between episodes 10 and 15. Also, after the shutdown, the code that follows `print(episode)` (shown below) no longer runs, so there seems to be a problem with the session.
saver = tf.train.Saver()

if training:
    with tf.Session() as sess:
        # Initialize the variables
        sess.run(tf.global_variables_initializer())

        # Initialize the decay rate (that will be used to reduce epsilon)
        decay_step = 0

        for episode in range(total_episodes):
            print(episode)
            step = 0
            episode_rewards = []
            state = env.reset()
            state, stacked_frames = stack_frames(stacked_frames, state, True)

            while step < max_steps:
                step += 1
                decay_step += 1

                # Epsilon-greedy action selection; action comes back one-hot
                action, explore_probability = predict_action(explore_start,
                                                             explore_stop, decay_rate,
                                                             decay_step, state, possible_actions)

                # np.nonzero(action)[0] maps the one-hot vector to the action index
                next_state, reward, done, info = env.step(np.nonzero(action)[0])

                if episode_render:
                    env.render()

                episode_rewards.append(reward)

                if done:
                    # The episode is over: feed a blank frame into the stack
                    next_state = np.zeros((110, 84), dtype=np.int)
                    next_state, stacked_frames = stack_frames(stacked_frames,
                                                              next_state, False)

                    # Set step = max_steps to end the episode
                    step = max_steps

                    # Get the total reward of the episode
                    total_reward = np.sum(episode_rewards)

                    print('Episode: {}'.format(episode),
                          'Total reward: {}'.format(total_reward),
                          'Explore P: {:.4f}'.format(explore_probability),
                          'Training Loss {:.4f}'.format(loss))

                    # Store tuple <st, at, rt+1, st+1> in memory D
                    memory.add((state, action, reward, next_state, done))
                    state = next_state
                else:
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    memory.add((state, action, reward, next_state, done))
                    state = next_state

                ## Learning Part
                # Obtain a random mini-batch from memory
                batch = memory.sample(batch_size)
                states_mb = np.array([each[0] for each in batch], ndmin=3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                dones_mb = np.array([each[4] for each in batch])

                target_Qs_batch = []

                # Get Q values for the next state
                Qs_next_state = sess.run(DQNetwork.output,
                                         feed_dict={DQNetwork.inputs_: next_states_mb})

                # Set Q_target = r if the episode ends at s+1,
                # otherwise Q_target = r + gamma * max(Q(s', a'))
                for i in range(0, len(batch)):
                    terminal = dones_mb[i]
                    # If we are in a terminal state, the target is just the reward
                    if terminal:
                        target_Qs_batch.append(rewards_mb[i])
                    else:
                        target = rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                        target_Qs_batch.append(target)

                targets_mb = np.array([each for each in target_Qs_batch])

                loss, _ = sess.run([DQNetwork.loss, DQNetwork.optimizer],
                                   feed_dict={DQNetwork.inputs_: states_mb,
                                              DQNetwork.target_Q: targets_mb,
                                              DQNetwork.actions_: actions_mb})

                # Write TF summaries
                summary = sess.run(write_op, feed_dict={DQNetwork.inputs_: states_mb,
                                                        DQNetwork.target_Q: targets_mb,
                                                        DQNetwork.actions_: actions_mb})
                writer.add_summary(summary, episode)
                writer.flush()

            # Save the model every 5 episodes
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/Space_Invaders_model.ckpt")
                print("Model Saved")
There are no error messages in the Spyder console either. The actual output is:
Episode: 10 Total reward: 370.0 Explore P: 0.9250 Training Loss 0.0252
Model Saved
11
Episode: 11 Total reward: 100.0 Explore P: 0.9200 Training Loss 3.4623
12
Episode: 12 Total reward: 200.0 Explore P: 0.9117 Training Loss 22.2125
13
But it should go all the way up to 50 (total_episodes).
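To surface whatever kills the loop, here is a sketch of how the episode loop could be instrumented (the loop over `total_episodes` is the one from above; `faulthandler` is standard library and also reports hard crashes such as segfaults, which an IDE console may otherwise swallow):

import faulthandler
import traceback

faulthandler.enable()  # dump a traceback even on hard crashes (e.g. a segfault)

try:
    for episode in range(total_episodes):
        ...  # training loop body from above, unchanged
except Exception:
    traceback.print_exc()  # make any Python-level exception visible before exiting
    raise

Running the script from a plain terminal (`python script.py`) instead of Spyder may also show output that the IDE console loses when the kernel dies.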