Machine learning reward is artificially capped

Date: 2018-07-11 22:28:40

Tags: python tensorflow openai-gym

So when I run this it works perfectly, but for some reason the reward caps out at 200. I'm not sure what is causing it. I'm new to machine learning and this is my first project, so sorry if I'm missing something stupid. I assumed done was being triggered before I wanted it to be, but playing around with that didn't lead anywhere. Thanks a lot.

import gym
import tensorflow as tf
import numpy as np
import os
import sys

env = gym.make('CartPole-v0')
discount_rate=.95

# TODO Build the policy gradient neural network
class Agent:
    def __init__(self, num_actions, state_size):

        initializer = tf.contrib.layers.xavier_initializer()

        self.input_layer = tf.placeholder(dtype=tf.float32, shape=[None, state_size])

        # Neural net starts here

        hidden_layer = tf.layers.dense(self.input_layer, 8, activation=tf.nn.relu, kernel_initializer=initializer)
        hidden_layer_2 = tf.layers.dense(hidden_layer, 8, activation=tf.nn.relu, kernel_initializer=initializer)

        # Output of neural net
        out = tf.layers.dense(hidden_layer_2, num_actions, activation=None)

        self.outputs = tf.nn.softmax(out)
        self.choice = tf.argmax(self.outputs, axis=1)

        # Training Procedure
        self.rewards = tf.placeholder(shape=[None, ], dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None, ], dtype=tf.int32)

        one_hot_actions = tf.one_hot(self.actions, num_actions)

        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=out, labels=one_hot_actions)

        self.loss = tf.reduce_mean(cross_entropy * self.rewards)

        self.gradients = tf.gradients(self.loss, tf.trainable_variables())

        # Create a placeholder list for gradients
        self.gradients_to_apply = []
        for index, variable in enumerate(tf.trainable_variables()):
            gradient_placeholder = tf.placeholder(tf.float32)
            self.gradients_to_apply.append(gradient_placeholder)

        # Create the operation to update gradients with the gradients placeholder.
        optimizer = tf.train.AdamOptimizer(learning_rate=1e-2)
        self.update_gradients = optimizer.apply_gradients(zip(self.gradients_to_apply, tf.trainable_variables()))



def discount_normalize_rewards(rewards):
    discounted_rewards = np.zeros_like(rewards)
    total_rewards = 0

    for i in reversed(range(len(rewards))):
        total_rewards = total_rewards * discount_rate + rewards[i]
        discounted_rewards[i] = total_rewards

    discounted_rewards -= np.mean(discounted_rewards)
    discounted_rewards /= np.std(discounted_rewards)

    return discounted_rewards


#initialize the training loop
tf.reset_default_graph()

# Modify these to match shape of actions and states in your environment
num_actions = 2
state_size = 4

path = "./cartpole-pg/"

training_episodes = 1000
max_steps_per_episode = 20000
episode_batch_size = 5

agent = Agent(num_actions, state_size)

init = tf.global_variables_initializer()

saver = tf.train.Saver(max_to_keep=2)

if not os.path.exists(path):
    os.makedirs(path)

with tf.Session() as sess:
    sess.run(init)

    total_episode_rewards = []

    # Create a buffer of 0'd gradients
    gradient_buffer = sess.run(tf.trainable_variables())
    for index, gradient in enumerate(gradient_buffer):
        gradient_buffer[index] = gradient * 0

    for episode in range(training_episodes):

        state = env.reset()

        episode_history = []
        episode_rewards = 0

        for step in range(max_steps_per_episode):

            if episode % 100 == 0:
                env.render()

            # Get weights for each action
            action_probabilities = sess.run(agent.outputs, feed_dict={agent.input_layer: [state]})
            action_choice = np.random.choice(range(num_actions), p=action_probabilities[0])

            state_next, reward, done, _ = env.step(action_choice)
            episode_history.append([state, action_choice, reward, state_next])
            state = state_next

            episode_rewards += reward

            if done:
                total_episode_rewards.append(episode_rewards)
                episode_history = np.array(episode_history)
                episode_history[:,2] = discount_normalize_rewards(episode_history[:,2])

                ep_gradients = sess.run(agent.gradients, feed_dict={agent.input_layer: np.vstack(episode_history[:, 0]),
                                                                    agent.actions: episode_history[:, 1],
                                                                    agent.rewards: episode_history[:, 2]})
                # add the gradients to the grad buffer:
                for index, gradient in enumerate(ep_gradients):
                    gradient_buffer[index] += gradient

                break

        if episode % episode_batch_size == 0:

            feed_dict_gradients = dict(zip(agent.gradients_to_apply, gradient_buffer))

            sess.run(agent.update_gradients, feed_dict=feed_dict_gradients)

            for index, gradient in enumerate(gradient_buffer):
                gradient_buffer[index] = gradient * 0

        if episode % 1 == 0:
            saver.save(sess, path + "pg-checkpoint", episode)
            print("Reward: " + str(total_episode_rewards[-1:]))


env.close()

1 Answer:

Answer 0 (score: 1)

A CartPole episode terminates when the pole falls over and, additionally, after 200 successful steps. If you want to change this, see max_episode_steps in the linked file. The reason for the 200-step cap is to make evaluation runs easier (e.g. you always get an end of episode, so you can compute per-episode statistics) and to keep the environment from getting stuck in a never-ending trial.

register(
    id='CartPole-v0',
    entry_point='gym.envs.classic_control:CartPoleEnv',
    max_episode_steps=200,
    reward_threshold=195.0,
)
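
If you do want episodes longer than 200 steps, here is a minimal sketch of two options, assuming your gym version exposes gym.envs.registration.register and the env.unwrapped attribute (the ID CartPoleLong-v0 below is made up for illustration): register your own variant with a larger max_episode_steps, or step the unwrapped environment directly.

import gym
from gym.envs.registration import register

# Option 1: register a longer-horizon variant under a new, hypothetical ID.
# Re-registering 'CartPole-v0' itself would raise an error, so use a fresh name.
register(
    id='CartPoleLong-v0',
    entry_point='gym.envs.classic_control:CartPoleEnv',
    max_episode_steps=20000,  # e.g. match max_steps_per_episode in the question
)
long_env = gym.make('CartPoleLong-v0')

# Option 2: bypass the TimeLimit wrapper that enforces the 200-step cap.
# gym.make returns a TimeLimit-wrapped env; .unwrapped is the raw CartPoleEnv,
# which only ends an episode when the pole falls or the cart leaves the track.
raw_env = gym.make('CartPole-v0').unwrapped

With the unwrapped environment nothing else ends an episode, so the "for step in range(max_steps_per_episode)" loop in the question becomes the only step limit.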