A2C reinforcement learning agent does not learn

Asked: 2019-06-04 13:30:49

Tags: python tensorflow reinforcement-learning

I trained an A2C model with TensorFlow 2.0 on the CartPole-v0 environment. Now I have switched to a continuous car environment, with the eventual goal of merging multiple agents into one lane. For now the only difference is the action space: instead of CartPole's discrete left/right actions, the ego vehicle takes a 2D vector (acceleration, steering angle). The observation space is a 4D vector [jerk, y_Deviation, v_deviation, Collissions_with_agent_or_road]. In CartPole I sampled actions from a categorical distribution; since this environment is continuous, I thought of using a Beta distribution to sample actions (a short sketch of what I mean follows below). However, the agent does not learn at all, and I have tried tweaking almost every parameter without success. After the sketch, my code for the agent and the network follows:
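
To make the Beta idea concrete, this is roughly what I have in mind for sampling the 2D action (only a minimal sketch with tensorflow_probability; the network output values and the action bounds below are made up for illustration and are not part of my code):

import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

# pretend network output: 4 raw values -> one (concentration1, concentration0) pair per action dimension
raw_params = tf.constant([[1.3, 0.8, 2.1, 1.5]])      # shape (1, 4)
alpha = tf.nn.softplus(raw_params[:, 0:2]) + 1.0      # concentration1, kept > 1
beta = tf.nn.softplus(raw_params[:, 2:4]) + 1.0       # concentration0, kept > 1
dist = tfd.Beta(concentration1=alpha, concentration0=beta)

action_01 = dist.sample()                             # 2D action in (0, 1), shape (1, 2)
log_prob = dist.log_prob(action_01)                   # what a policy-gradient loss would weight by the advantage
# rescale to made-up bounds: acceleration in [-3, 3] m/s^2, steering in [-0.5, 0.5] rad
action = action_01 * tf.constant([6.0, 1.0]) + tf.constant([-3.0, -0.5])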

import logging

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras.losses as kls
import tensorflow.keras.optimizers as ko
# rp (replay buffer), nw (network) and DivineEnvironment come from my own project


class A2CAgent:
    def __init__(self, model):
        # hyperparameters for loss terms, gamma is the discount coefficient
        self.params = {
            'gamma': 0.9,
            'value': 0.1,
            'entropy': 0.1
        }
        self.model = model
        self.model.compile(
            optimizer=ko.Adam(lr=0.0003),
            # define separate losses for policy logits and value estimate
            loss=[self._logits_loss, self._value_loss]
        )
        self.buffer = rp.ReplayBuffer(buff_size, mini_batch=32)

    def training_step(self, step, observations, actions, values, rewards, next_obs, dones, ep_rews):
        observations[step] = next_obs.copy()  # fill observations from environment
        actions[step], values[step] = self.model.action_value(next_obs[None, :])  # get action and its value
        next_obs, rewards[step], dones[step], _ = env.step(
            actions[step])  # perform the next step to get the new observation and reward

        ep_rews[-1] += rewards[step]  # only update the last entry (current episode)
        if dones[step]:
            ep_rews.append(0.0)  # start a new entry for the next episode
            next_obs = env.reset()
            logging.info("Episode: %03d, Reward: %03d" % (len(ep_rews) - 1, ep_rews[-2]))
        return next_obs, ep_rews, rewards, dones, values, observations

    def train(self, env, batch_sz=126, updates=100, callbacks=None):
        # storage helpers for a single batch of data
        actions = np.empty((batch_sz, 2), dtype=np.int32)  # (batch_sz, 2)
        rewards, dones, values = np.empty((3, batch_sz))  # (3, batch_sz) - each row has shape (batch_sz,)
        observations = np.empty((batch_sz,) + (8,))  # (batch_sz, 8)
        # training loop: collect samples, send to optimizer, repeat `updates` times
        ep_rews = [0.0]
        next_obs = env.reset()
        for update in range(updates):
            for step in range(batch_sz):
                next_obs, ep_rews, rewards, dones, values, observations = self.training_step(
                    step, observations, actions, values, rewards, next_obs, dones, ep_rews)

            _, next_value = self.model.action_value(next_obs[None, :])  # value estimate of the last observed state
            # rewi = [x[2] for x in (self.buffer.experience[:self.buffer.current_index])]  # extract rewards from buffer
            returns, advs = self._returns_advantages(rewards, dones, values, next_value)
            # returns are cumulative rewards and advantages are returns - baseline
            acts_and_advs = np.concatenate([actions, advs[:, None]], axis=-1)  # (batch_sz, 3) (action, advantage)
            # acts_and_advs = [actions, advs]
            # performs a full training step on the collected batch
            # note: no need to mess around with gradients, the Keras API handles it
            losses = self.model.train_on_batch(observations, [acts_and_advs, returns])
            # logging.info("[%d/%d] Losses: %s" % (update + 1, updates, losses))

        return ep_rews

    def test(self, env, render=False):
        steps = 0
        obs, done, ep_reward = env.reset(), False, 0
        while not done:
            steps += 1
            action, _ = self.model.action_value(obs[None, :])
            obs, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.draw()
                plt.pause(0.001)
        print(steps)
        return ep_reward

    def _returns_advantages(self, rewards, dones, values, next_value):
        # next_value is the critic's value estimate of the state following the batch
        returns = np.append(np.zeros_like(rewards), next_value, axis=-1)  # (batch_sz + 1,) - batch_sz zeros plus the bootstrap value
        # returns are the discounted sum of future rewards
        for t in reversed(range(rewards.shape[0])):  # t starts at batch_sz - 1
            returns[t] = rewards[t] + self.params['gamma'] * returns[t + 1] * (1 - dones[t])
        returns = returns[:-1]  # drop the bootstrap element
        # advantages are returns - baseline (= value estimates in our case)
        advantages = returns - values  # (batch_sz,)
        return returns, advantages

    def _value_loss(self, returns, value):  # Q-value loss
        # value loss is typically MSE between value estimates and returns
        return self.params['value'] * kls.mean_squared_error(returns, value)

    def _logits_loss(self, acts_and_advs, logits):  # is the policy in general good?
        # a trick to input actions and advantages through the same API
        actions = acts_and_advs[..., :2]
        advantages = acts_and_advs[..., 2:]
        # MSE loss object that supports the sample_weight arg on call()
        # (replaces the sparse categorical CE used in the discrete CartPole version)
        weighted_sparse_ce = kls.MeanSquaredError()
        # policy loss is defined by policy gradients, weighted by advantages
        # note: we only calculate the loss on the actions we've actually taken
        actions = tf.cast(actions, tf.int32)
        policy_loss = weighted_sparse_ce(actions, advantages, sample_weight=advantages)
        policy_loss = np.sum(policy_loss)
        # entropy loss can be calculated via CE of the logits over themselves
        entropy_loss = kls.binary_crossentropy(logits, logits, from_logits=True)
        # entropy_loss = kls.categorical_crossentropy(logits, logits, from_logits=True)
        # here signs are flipped because the optimizer minimizes
        return (1 / policy_loss - self.params['entropy'] * entropy_loss)


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)

    env = DivineEnvironment("examples/data/merging.json",
                            ego_agent_id=1,
                            camera_follow=True,
                            idm_enabled=True)
    network = nw.Network(num_actions=4)  # learn Beta distribution parameters for both actions [acceleration, steering angle]
    agent = A2CAgent(network)

    rewards_history = agent.train(env)
    for _ in range(0, 5):
        print("Total Episode Reward: %d out of 200" % agent.test(env, True))
    # print("Finished training.")
    # print("Total Episode Reward: %d out of 200" % agent.test(env, True))

And the network in TensorFlow:

import numpy as np
import tensorflow as tf
import tensorflow.keras.layers as kl
import tensorflow_probability as tfp


class Network(tf.keras.Model):
    def __init__(self, num_actions):
        # mlp = multi layer perceptron
        super().__init__('mlp_policy')
        # hidden1 for the actor model
        self.hidden1 = tf.keras.Sequential()
        self.hidden1.add(kl.Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001),
                                  kernel_initializer=tf.keras.initializers.lecun_normal(seed=None)))
        self.hidden1.add(kl.GaussianDropout(0.4))
        self.hidden1.add(kl.Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))
        # hidden2 for the critic model
        self.hidden2 = tf.keras.Sequential()
        self.hidden2.add(kl.Dense(128, activation='relu', name='hidden2_output'))
        self.value = kl.Dense(1, name='value')
        # logits are unnormalized log probabilities
        self.logits = kl.Dense(num_actions, kernel_regularizer=tf.keras.regularizers.l2(0.002),
                               kernel_initializer=tf.keras.initializers.lecun_normal(seed=None),
                               name='policy_logits')
        # self.dist = tfp.layers.DistributionLambda(lambda t: tfp.distributions.Beta(concentration1=t[:,0:2], concentration0=t[:,2:]),
        # self.dist = tfp.layers.DistributionLambda(lambda t: tfp.distributions.Beta(concentration1=2, concentration0=t[:,0:2]),
        #                                           convert_to_tensor_fn=lambda s: s.sample(),
        self.dist = tfp.layers.IndependentNormal(2, name='probability_layer')

    def call(self, inputs):
        # inputs is a numpy array, convert to Tensor
        x = tf.convert_to_tensor(inputs)
        # separate hidden layers from the same input tensor
        hidden_logs = self.hidden1(x)
        hidden_out = self.logits(hidden_logs)
        hidden_vals = self.hidden2(x)
        return tf.convert_to_tensor(self.dist(hidden_out)), self.value(hidden_vals)  # comment this out if you want to use the raw logits

    def action_value(self, obs):
        # executes call() under the hood
        action, value = self.predict(obs)
        return action, np.squeeze(value, axis=-1)
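
For completeness, the commented-out Beta head in __init__ above is what I originally tried; wired in as a layer it would look roughly like this (again only a sketch, and the softplus shift that keeps the concentrations positive is an assumption, not part of my actual code):

import tensorflow as tf
import tensorflow_probability as tfp

# illustrative Beta policy head: num_actions = 4 raw outputs are split into
# two positive concentration parameters per action dimension
beta_head = tfp.layers.DistributionLambda(
    lambda t: tfp.distributions.Beta(
        concentration1=tf.nn.softplus(t[..., 0:2]) + 1.0,
        concentration0=tf.nn.softplus(t[..., 2:4]) + 1.0),
    convert_to_tensor_fn=lambda d: d.sample(),
    name='probability_layer')
# in call() this would replace self.dist(hidden_out)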

I would be glad to hear any tips and/or best practices for reinforcement learning models, since I am just getting started with this topic.

0 Answers:

No answers yet