I have implemented the A3C algorithm following this tutorial: https://jaromiru.com/2017/03/26/lets-make-an-a3c-implementation/
Now I would like to use the same logic to implement Proximal Policy Optimization (PPO) so that it can also run in parallel. I have found some PPO code online, but I don't really understand how to fit it into my existing code.
Could you help me, or give me some insight into how I would need to change the code to support PPO?
Original A3C code:
def _build_model(self):
    l_input = Input( batch_shape=(None, NUM_STATE) )
    l_dense = Dense(16, activation='relu')(l_input)

    out_actions = Dense(NUM_ACTIONS, activation='softmax')(l_dense)
    out_value   = Dense(1, activation='linear')(l_dense)

    model = Model(inputs=[l_input], outputs=[out_actions, out_value])
    model._make_predict_function()  # have to initialize before threading

    return model

def _build_graph(self, model):
    s_t = tf.placeholder(tf.float32, shape=(None, NUM_STATE))
    a_t = tf.placeholder(tf.float32, shape=(None, NUM_ACTIONS))
    r_t = tf.placeholder(tf.float32, shape=(None, 1))  # not immediate, but discounted n-step reward

    p, v = model(s_t)

    log_prob  = tf.log( tf.reduce_sum(p * a_t, axis=1, keep_dims=True) + 1e-10 )
    advantage = r_t - v

    loss_policy = - log_prob * tf.stop_gradient(advantage)  # maximize policy
    loss_value  = LOSS_V * tf.square(advantage)              # minimize value error
    entropy     = LOSS_ENTROPY * tf.reduce_sum(p * tf.log(p + 1e-10), axis=1, keep_dims=True)  # maximize entropy (regularization)

    loss_total = tf.reduce_mean(loss_policy + loss_value + entropy)

    optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE, decay=.99)
    minimize  = optimizer.minimize(loss_total)

    return s_t, a_t, r_t, minimize
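From what I have read so far, the main change would be inside _build_graph: the policy-gradient loss (-log_prob * advantage) gets replaced by PPO's clipped surrogate objective, which compares the current policy against the policy that collected the samples. Below is my rough sketch of that change, in the same TF1 style as the code above; prob_old_t and CLIP_EPSILON are my own names and not from the tutorial, so please correct me if I'm misunderstanding:

    # Sketch only: extra placeholder for pi_old(a|s), recorded when the sample was generated
    prob_old_t = tf.placeholder(tf.float32, shape=(None, 1))

    prob      = tf.reduce_sum(p * a_t, axis=1, keep_dims=True) + 1e-10   # pi_theta(a|s) of chosen action
    ratio     = prob / (prob_old_t + 1e-10)                              # r(theta) = pi_theta / pi_theta_old
    advantage = tf.stop_gradient(r_t - v)

    surr1 = ratio * advantage
    surr2 = tf.clip_by_value(ratio, 1.0 - CLIP_EPSILON, 1.0 + CLIP_EPSILON) * advantage
    loss_policy = - tf.minimum(surr1, surr2)   # clipped surrogate, per sample; loss_total's reduce_mean stays as before

If that is roughly right, I would also have to store pi_old(a|s) for the chosen action alongside the (s, a, r) samples the workers already push, and as far as I understand PPO usually runs several epochs of updates over each collected batch instead of a single gradient step. Is that the correct way to adapt this code, and how would I organize it across the parallel workers?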