A2C not working because the critic loss does not converge

Asked: 2019-02-16 16:58:14

Tags: tensorflow reinforcement-learning

I'm trying to implement my own Advantage Actor-Critic (A2C) algorithm with TensorFlow. I used the code in https://github.com/BoYanSTKO/Practical_RL-coursera/blob/master/week5_policy_based/practice_a3c.ipynb as a rough template for how to write the algorithm.
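
Concretely, the update I'm trying to implement (as I understand it from that notebook) is: the actor minimizes

    actor_loss  = -mean( log pi(a|s) * A(s,a) ) - 0.01 * mean( entropy(pi(.|s)) )

and the critic minimizes

    critic_loss = mean( A(s,a)^2 )

where A(s,a) = r + gamma * (1 - done) * V(s') - V(s) is the one-step advantage, treated as a constant (stop_gradient) in the actor loss.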

I tried it on the simple CartPole-v0 gym environment, but my implementation fails: the critic's loss just explodes and becomes huge, while the actor's loss stays rather low.

I'm not sure what I'm doing wrong. Any help? :)

I've tried separating the actor and the critic into two different networks (a simplified sketch of that variant is included after the full code below), which didn't help either. I've also tried tuning things like gamma and the learning rate, without success.

#!/usr/bin/python

import tensorflow as tf
import numpy as np
import gym
import random
from tensorboardX import SummaryWriter


class ActorCritic():

    def __init__(self, state_dim, n_actions, learning_rate, gamma=0.99):
        with tf.variable_scope("ActorCritic"):
            # Placeholders for a batch of transitions
            self.states_ph = tf.placeholder(tf.float32, (None, state_dim), name="states")
            self.action_ph = tf.placeholder(tf.int32, (None,), name="actions")
            self.n_actions = n_actions
            self.reward_ph = tf.placeholder(tf.float32, (None,), name="rewards")
            self.next_state_values = tf.placeholder(tf.float32, (None,), name="rewards")
            self.is_done_ph = tf.placeholder(tf.float32, (None,), name="rewards")

            # Shared hidden layer feeding both the policy head and the value head
            net = tf.layers.dense(self.states_ph, 24, activation=tf.nn.relu)
            self.logits = tf.layers.dense(net, n_actions, activation=None)
            self.state_values = tf.layers.dense(net, 1, activation=None)

            self.action_probs = tf.nn.softmax(self.logits)
            self.log_prob = tf.nn.log_softmax(self.logits)
            self.entropy = -tf.reduce_sum(self.action_probs * self.log_prob, axis=-1, name="entropy")
            self.logp_actions = tf.reduce_sum(self.log_prob * tf.one_hot(self.action_ph, depth=n_actions), axis=-1)

            # One-step TD target and advantage
            self.target_state_values = self.reward_ph + gamma * (1.0 - self.is_done_ph) * self.next_state_values
            self.advantage = self.target_state_values - self.state_values

            # Actor loss with entropy bonus; critic loss is the squared advantage
            self.actor_loss = -tf.reduce_mean(self.logp_actions * tf.stop_gradient(self.advantage)) - 0.01 * tf.reduce_mean(self.entropy)
            self.critic_loss = tf.reduce_mean(self.advantage ** 2.0)

            self.train_opt = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.actor_loss + self.critic_loss)

    def train(self, states, actions, rewards, is_done, nxt_state_values_batch):
        sess = tf.get_default_session()
        return sess.run([self.critic_loss, self.actor_loss, self.train_opt], feed_dict={
            self.next_state_values: nxt_state_values_batch,
            self.states_ph: states,
            self.action_ph: actions,
            self.reward_ph: rewards,
            self.is_done_ph: is_done})

    def predict_state_values(self, states):
        sess = tf.get_default_session()
        return sess.run(self.state_values, feed_dict={self.states_ph: states})

    def sample_actions(self, states):
        sess = tf.get_default_session()
        action_probs = sess.run(self.action_probs, {self.states_ph: states})
        return [np.random.choice(range(self.n_actions), p=action_prob) for action_prob in action_probs]

class EnvBatch():

    def __init__(self, env_name, n_envs):
        self.envs = [gym.make(env_name) for env in range(n_envs)]
        self.n_actions = self.envs[0].action_space.n
        self.state_dim = self.envs[0].observation_space.shape[0]

    def reset(self):
        return [env.reset().tolist() for env in self.envs]

    def step(self, actions):
        states_batch, rewards_batch, is_done_batch = [], [], []

        for action, env in zip(actions, self.envs):
            s, r, d, _ = env.step(action)
            # Reset finished environments so the batch keeps running
            if d:
                s = env.reset()

            states_batch.append(s)
            rewards_batch.append(r)
            is_done_batch.append(d)

        return np.array(states_batch), np.array(rewards_batch), np.array(is_done_batch)

def evaluate_performance(env_name, agent, nr_runs=10):

    env = gym.make(env_name)
    rewards = []

    for _ in range(nr_runs):
        state = env.reset()
        is_done = False
        acc_reward = 0.0

        while not is_done:
            action = agent.sample_actions([state])
            nxt_state, reward, is_done, _ = env.step(action[0])
            state = nxt_state
            acc_reward += reward

        rewards.append(acc_reward)

    return np.mean(rewards)

tf.reset_default_graph()

env = EnvBatch("CartPole-v0", 10)
agent = ActorCritic(env.state_dim, env.n_actions, learning_rate=0.001)

sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

state_batch = env.reset()
writer = SummaryWriter()

for i in range(100000):

    actions = agent.sample_actions(state_batch)

    nxt_state_batch, rewards_batch, is_done_batch = env.step(actions)

    nxt_state_values = agent.predict_state_values(nxt_state_batch).ravel()

    critic_loss, actor_loss, _ = agent.train(state_batch, actions, rewards_batch, is_done_batch, nxt_state_values)

    writer.add_scalar("actor_loss", actor_loss, i)
    writer.add_scalar("critic_loss", critic_loss, i)

    if i % 50 == 0:
        test_reward = evaluate_performance("CartPole-v0", agent)
        writer.add_scalar("test_reward", test_reward, i)
        if test_reward > 195:
            print("Done!")

    states_batch = nxt_state_batch

sess.close()
writer.close()
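
For reference, the two-network variant mentioned above looked roughly like this (a simplified sketch, not the exact code I ran; the class name is just for illustration, and the train/predict/sample methods stayed the same as in ActorCritic):

class SeparateActorCritic():
    def __init__(self, state_dim, n_actions, learning_rate, gamma=0.99):
        self.states_ph = tf.placeholder(tf.float32, (None, state_dim), name="states")
        self.action_ph = tf.placeholder(tf.int32, (None,), name="actions")
        self.reward_ph = tf.placeholder(tf.float32, (None,), name="rewards")
        self.next_state_values = tf.placeholder(tf.float32, (None,), name="next_state_values")
        self.is_done_ph = tf.placeholder(tf.float32, (None,), name="is_done")
        self.n_actions = n_actions

        # Actor: its own hidden layer and policy head
        with tf.variable_scope("actor"):
            actor_net = tf.layers.dense(self.states_ph, 24, activation=tf.nn.relu)
            self.logits = tf.layers.dense(actor_net, n_actions, activation=None)
            self.action_probs = tf.nn.softmax(self.logits)
            self.log_prob = tf.nn.log_softmax(self.logits)

        # Critic: its own hidden layer and value head
        with tf.variable_scope("critic"):
            critic_net = tf.layers.dense(self.states_ph, 24, activation=tf.nn.relu)
            self.state_values = tf.layers.dense(critic_net, 1, activation=None)

        self.entropy = -tf.reduce_sum(self.action_probs * self.log_prob, axis=-1)
        self.logp_actions = tf.reduce_sum(self.log_prob * tf.one_hot(self.action_ph, depth=n_actions), axis=-1)

        # Same target/advantage and losses as in the shared-network version
        self.target_state_values = self.reward_ph + gamma * (1.0 - self.is_done_ph) * self.next_state_values
        self.advantage = self.target_state_values - self.state_values

        self.actor_loss = -tf.reduce_mean(self.logp_actions * tf.stop_gradient(self.advantage)) - 0.01 * tf.reduce_mean(self.entropy)
        self.critic_loss = tf.reduce_mean(self.advantage ** 2.0)

        # Each loss only updates its own network's variables
        actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="actor")
        critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="critic")
        self.train_opt = tf.group(
            tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.actor_loss, var_list=actor_vars),
            tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.critic_loss, var_list=critic_vars))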

0 Answers:

There are no answers yet.