How does this code work:
import tensorflow as tf

def policy_gradient():
    with tf.variable_scope("policy"):
        # Linear policy: 4 state features -> 2 action logits.
        params = tf.get_variable("policy_parameters", [4, 2])
        state = tf.placeholder("float", [None, 4])
        actions = tf.placeholder("float", [None, 2])      # one-hot actions taken
        advantages = tf.placeholder("float", [None, 1])
        linear = tf.matmul(state, params)
        probabilities = tf.nn.softmax(linear)
        # Probability the policy assigned to the action actually taken.
        good_probabilities = tf.reduce_sum(tf.multiply(probabilities, actions),
                                           reduction_indices=[1])
        # Advantage-weighted log-probability of the chosen actions.
        eligibility = tf.log(good_probabilities) * advantages
        loss = -tf.reduce_sum(eligibility)
        tf.summary.scalar("loss", loss)
        optimizer = tf.train.AdamOptimizer(0.01).minimize(loss)
        log_ph = tf.placeholder("float", [])
        tf.summary.scalar("totalreward log", log_ph)
        return probabilities, state, actions, advantages, optimizer, log_ph
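
For context, here is roughly how these handles get driven during training. This is a minimal sketch assuming the usual TF 1.x session workflow; states_batch, actions_batch, and advantages_batch are illustrative stand-in data, not part of the original script:

import numpy as np
import tensorflow as tf

probabilities, state, actions, advantages, optimizer, log_ph = policy_gradient()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Illustrative rollout data: 10 transitions with 4-dim states,
    # one-hot actions over 2 choices, and per-transition advantages.
    states_batch = np.random.randn(10, 4)
    actions_batch = np.eye(2)[np.random.randint(2, size=10)]
    advantages_batch = np.random.randn(10, 1)

    # Query action probabilities for the current policy.
    probs = sess.run(probabilities, feed_dict={state: states_batch})

    # One gradient step on the policy-gradient loss.
    sess.run(optimizer, feed_dict={state: states_batch,
                                   actions: actions_batch,
                                   advantages: advantages_batch})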
The reward is increasing as expected, and the implementation is the official OpenAI solution, so I'm fairly sure the algorithm is correct...
Source: https://github.com/kvfrans/openai-cartpole/blob/master/cartpole-policygradient.py
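
For reference, the loss in the code corresponds to the negative of the standard REINFORCE objective, with good_probabilities playing the role of $\pi_\theta(a_t \mid s_t)$ and advantages the role of $A_t$:

$$
L(\theta) = -\sum_t \log \pi_\theta(a_t \mid s_t)\, A_t
$$

Minimizing this sum is equivalent to gradient ascent on the advantage-weighted log-likelihood of the actions taken.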