DQN algorithm does not converge on CartPole-v0

Posted: 2019-04-06 18:47:38

Tags: python tensorflow reinforcement-learning

Brief description of my model

I am trying to write my own DQN algorithm in Python, using TensorFlow and following Mnih et al. (2015). In the train_DQN function I define the training procedure, and DQN_CartPole defines the function approximation (a simple three-layer neural network). For the loss function, Huber loss or MSE is applied, followed by gradient clipping (between -1 and 1). Then I implemented a soft update of the target network, which partially copies over the weights of the main network, instead of a hard update.
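
For reference, the soft update works roughly like this (a simplified sketch of the soft_target_model_update helper used further down, not the exact implementation; the assign ops are built on the fly here for brevity):

# sketch of a soft target-network update: theta_target <- tau * theta_main + (1 - tau) * theta_target
def soft_target_model_update(sess, main_model, target_model, tau=1e-2):
    main_vars = sorted(tf.trainable_variables(main_model.scope), key=lambda v: v.name)
    target_vars = sorted(tf.trainable_variables(target_model.scope), key=lambda v: v.name)
    # blend each target variable towards its counterpart in the main network
    ops = [t.assign(tau * m + (1.0 - tau) * t) for m, t in zip(main_vars, target_vars)]
    sess.run(ops)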

Problem

I am trying it on the CartPole environment (OpenAI Gym), but the reward does not improve the way it does with other people's algorithms, e.g. keras-rl. Any help would be appreciated.

[plot: reward over timestep]

If possible, could you have a look at my source code?

import time

import numpy as np
import tensorflow as tf


class Parameters:
    def __init__(self, mode=None):
        assert mode is not None
        print("Loading Params for {} Environment".format(mode))
        if mode == "Atari":
            self.state_reshape = (1, 84, 84, 1)
            self.num_frames = 1000000
            self.memory_size = 10000
            self.learning_start = 10000
            self.sync_freq = 1000
            self.batch_size = 32
            self.gamma = 0.99
            self.update_hard_or_soft = "soft"
            self.soft_update_tau = 1e-2
            self.epsilon_start = 1.0
            self.epsilon_end = 0.01
            self.decay_steps = 1000
            self.prioritized_replay_alpha = 0.6
            self.prioritized_replay_beta_start = 0.4
            self.prioritized_replay_beta_end = 1.0
            self.prioritized_replay_noise = 1e-6
        elif mode == "CartPole":
            self.state_reshape = (1, 4)
            self.num_frames = 10000
            self.memory_size = 20000
            self.learning_start = 100
            self.sync_freq = 100
            self.batch_size = 32
            self.gamma = 0.99
            self.update_hard_or_soft = "soft"
            self.soft_update_tau = 1e-2
            self.epsilon_start = 1.0
            self.epsilon_end = 0.01
            self.decay_steps = 500
            self.prioritized_replay_alpha = 0.6
            self.prioritized_replay_beta_start = 0.4
            self.prioritized_replay_beta_end = 1.0
            self.prioritized_replay_noise = 1e-6


class _DQN:
    """
    Boilerplate for DQN Agent
    """

    def __init__(self):
        """
        define the deep learning model here!

        """
        pass

    def predict(self, sess, state):
        """
        predict q-values given a state

        :param sess:
        :param state:
        :return:
        """
        return sess.run(self.pred, feed_dict={self.state: state})

    def update(self, sess, state, action, Y):
        feed_dict = {self.state: state, self.action: action, self.Y: Y}
        _, loss = sess.run([self.train_op, self.loss], feed_dict=feed_dict)
        # print(action, Y, sess.run(self.idx_flattened, feed_dict=feed_dict))
        return loss


class DQN_CartPole(_DQN):
    """
    DQN Agent for CartPole game
    """

    def __init__(self, scope, env, loss_fn ="MSE"):
        self.scope = scope
        self.num_action = env.action_space.n
        with tf.variable_scope(scope):
            self.state = tf.placeholder(shape=[None, 4], dtype=tf.float32, name="X")
            self.Y = tf.placeholder(shape=[None], dtype=tf.float32, name="Y")
            self.action = tf.placeholder(shape=[None], dtype=tf.int32, name="action")

            fc1 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(self.state)
            fc2 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(fc1)
            fc3 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(fc2)
            self.pred = tf.keras.layers.Dense(self.num_action, activation=tf.nn.relu)(fc3)

            # indices of the executed actions
            self.idx_flattened = tf.range(0, tf.shape(self.pred)[0]) * tf.shape(self.pred)[1] + self.action

            # passing [-1] to tf.reshape means flatten the array
            # using tf.gather, associate Q-values with the executed actions
            self.action_probs = tf.gather(tf.reshape(self.pred, [-1]), self.idx_flattened)

            if loss_fn == "huber_loss":
                # use huber loss
                self.losses = tf.subtract(self.Y, self.action_probs)
                self.loss = huber_loss(self.losses)
            elif loss_fn == "MSE":
                # use MSE
                self.losses = tf.squared_difference(self.Y, self.action_probs)
                self.loss = tf.reduce_mean(self.losses)
            else:
                assert False

            # you can choose whatever you want for the optimiser
            # self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
            self.optimizer = tf.train.AdamOptimizer()

            # to apply Gradient Clipping, we have to directly operate on the optimiser
            # check this: https://www.tensorflow.org/api_docs/python/tf/train/Optimizer#processing_gradients_before_applying_them
            self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
            self.clipped_grads_and_vars = [(ClipIfNotNone(grad, -1., 1.), var) for grad, var in self.grads_and_vars]
            self.train_op = self.optimizer.apply_gradients(self.clipped_grads_and_vars)



def train_DQN(main_model, target_model, env, replay_buffer, policy, params):
    """
    Train the DQN agent defined above

    :param main_model:
    :param target_model:
    :param env:
    :param params:
    :return:
    """

    # log purpose
    losses, all_rewards, cnt_action = [], [], []
    episode_reward, index_episode = 0, 0

    with tf.Session() as sess:
        # initialise all variables used in the model
        sess.run(tf.global_variables_initializer())
        state = env.reset()
        start = time.time()
        for frame_idx in range(1, params.num_frames + 1):
            action = policy.select_action(sess, target_model, state.reshape(params.state_reshape))
            cnt_action.append(action)
            next_state, reward, done, _ = env.step(action)
            replay_buffer.add(state, action, reward, next_state, done)

            state = next_state
            episode_reward += reward

            if done:
                index_episode += 1
                state = env.reset()
                all_rewards.append(episode_reward)

                if frame_idx > params.learning_start and len(replay_buffer) > params.batch_size:
                    states, actions, rewards, next_states, dones = replay_buffer.sample(params.batch_size)
                    next_Q = target_model.predict(sess, next_states)
                    Y = rewards + params.gamma * np.max(next_Q, axis=1) * np.logical_not(dones)
                    loss = main_model.update(sess, states, actions, Y)

                    # Logging and refreshing log purpose values
                    losses.append(np.mean(loss))

                    logging(frame_idx, params.num_frames, index_episode, time.time()-start, episode_reward, np.mean(loss), cnt_action)

                episode_reward = 0
                cnt_action = []
                start = time.time()

            if frame_idx > params.learning_start and frame_idx % params.sync_freq == 0:
                # soft update: blend a small fraction (tau) of the main network's weights into
                # the target network instead of copying them over completely (hard update)
                if params.update_hard_or_soft == "hard":
                    sync_main_target(sess, main_model, target_model)
                elif params.update_hard_or_soft == "soft":
                    soft_target_model_update(sess, main_model, target_model, tau=params.soft_update_tau)


    return all_rewards, losses
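
The helpers huber_loss and ClipIfNotNone are not shown above; roughly, they do something like the following (simplified sketches, not the exact implementations):

def huber_loss(x, delta=1.0):
    # quadratic penalty for small errors, linear penalty for large ones, averaged over the batch
    return tf.reduce_mean(
        tf.where(tf.abs(x) < delta,
                 0.5 * tf.square(x),
                 delta * (tf.abs(x) - 0.5 * delta)))


def ClipIfNotNone(grad, _min, _max):
    # tf.clip_by_value fails on None gradients, so pass those through untouched
    if grad is None:
        return grad
    return tf.clip_by_value(grad, _min, _max)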

Edits

  • dones -> np.logical_not(dones)
  • np.argmax -> np.max
  • separated MSE from huber_loss

1 Answer:

Answer 0 (score: 1)

In short, it seems that the dones variable is a binary vector where 1 denotes done and 0 denotes not done.

You then use dones here:

Y = rewards + params.gamma * np.argmax(next_Q, axis=1) * dones

So for all terminating transitions, you add the expected cumulative reward from following the policy for the rest of the episode (which is zero), and for all non-terminating transitions you do not add the expected cumulative reward.

I think you mean to do this the other way around: swap dones in the line of code above for np.logical_not(dones).

Also, now that I look at it, there is another major problem with this line. np.argmax(next_Q, axis=1) returns the indices of the maximum values in next_Q, not the maximum values themselves. You need np.max(next_Q, axis=1) to get the maximum expected reward over the next state's actions.
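
To make both fixes concrete, here is a small numpy example of the corrected target computation (the numbers are made up):

import numpy as np

rewards = np.array([1., 1., 1.])
dones   = np.array([0, 0, 1])          # only the last transition terminates
next_Q  = np.array([[0.5, 2.0],
                    [1.5, 0.3],
                    [0.9, 0.1]])

# np.argmax would give indices [1, 0, 0]; np.max gives the values we actually want
Y = rewards + 0.99 * np.max(next_Q, axis=1) * np.logical_not(dones)
# -> [2.98, 2.485, 1.0]: bootstrapped targets for the non-terminal states, plain reward for the terminal one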

Edit: The definition of the loss function is also a bit strange. You are mixing Huber loss with mean squared error. If you want huber_loss or MSE, you just compute it on the difference between the expected and predicted values; you appear to be doing both, which is certainly not a commonly defined loss function. For example, your model loss using Huber loss should be:

self.loss = tf.reduce_mean(huber_loss(abs(self.Y - self.action_probs)))
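
If huber_loss is your own helper, note that TF 1.x also provides tf.losses.huber_loss; assuming the default delta of 1.0, an equivalent would be roughly:

# equivalent using the built-in, averaged over the batch by its default reduction
self.loss = tf.losses.huber_loss(labels=self.Y, predictions=self.action_probs)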