I'm trying to train a basic OpenAI CartPole environment with TensorFlow...
Here is the code I'm trying to run (separated by notebook cells):
import tensorflow as tf
import gym
import numpy as np
import matplotlib.pyplot as plt
import os
%matplotlib inline
env = gym.make('CartPole-v1')
print(env.observation_space)
print(env.action_space)
games_to_play = 10

for i in range(games_to_play):
    # Reset the environment
    obs = env.reset()
    episode_rewards = 0
    done = False

    while not done:
        # Render the environment so we can watch
        env.render()

        # Choose a random action
        action = env.action_space.sample()

        # Take a step in the environment with the chosen action
        obs, reward, done, info = env.step(action)
        episode_rewards += reward

    # Print episode total rewards when done
    print(episode_rewards)

# Close the environment
env.close()
class Agent:
    def __init__(self, num_actions, state_size):
        initializer = tf.contrib.layers.xavier_initializer()

        self.input_layer = tf.placeholder(dtype=tf.float32, shape=[None, state_size])

        # Neural net starts here
        hidden_layer = tf.layers.dense(self.input_layer, 8, activation=tf.nn.relu, kernel_initializer=initializer)
        hidden_layer_2 = tf.layers.dense(hidden_layer, 8, activation=tf.nn.relu, kernel_initializer=initializer)

        # Output of neural net
        out = tf.layers.dense(hidden_layer_2, num_actions, activation=None)

        self.outputs = tf.nn.softmax(out)
        self.choice = tf.argmax(self.outputs, axis=1)

        # Training Procedure
        self.rewards = tf.placeholder(shape=[None, ], dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None, ], dtype=tf.int32)

        one_hot_actions = tf.one_hot(self.actions, num_actions)

        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=out, labels=one_hot_actions)

        self.loss = tf.reduce_mean(cross_entropy * self.rewards)

        self.gradients = tf.gradients(self.loss, tf.trainable_variables())

        # Create a placeholder list for gradients
        self.gradients_to_apply = []
        for index, variable in enumerate(tf.trainable_variables()):
            gradient_placeholder = tf.placeholder(tf.float32)
            self.gradients_to_apply.append(gradient_placeholder)

        # Create the operation to update gradients with the gradients placeholder.
        optimizer = tf.train.AdamOptimizer(learning_rate=1e-2)
        self.update_gradients = optimizer.apply_gradients(zip(self.gradients_to_apply, tf.trainable_variables()))
discount_rate = 0.95

def discount_normalize_rewards(rewards):
    discounted_rewards = np.zeros_like(rewards)
    total_rewards = 0

    for i in reversed(range(len(rewards))):
        total_rewards = total_rewards * discount_rate + rewards[i]
        discounted_rewards[i] = total_rewards

    # Normalize the discounted rewards before returning them
    discounted_rewards -= np.mean(discounted_rewards)
    discounted_rewards /= np.std(discounted_rewards)

    return discounted_rewards
tf.reset_default_graph()
# Modify these to match shape of actions and states in your environment
num_actions = 2
state_size = 4
path = "./cartpole-pg/"
training_episodes = 1000
max_steps_per_episode = 10000
episode_batch_size = 5
agent = Agent(num_actions, state_size)
init = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=2)
if not os.path.exists(path):
    os.makedirs(path)
with tf.Session() as sess:
    sess.run(init)

    total_episode_rewards = []

    # Create a buffer of 0'd gradients
    gradient_buffer = sess.run(tf.trainable_variables())
    for index, gradient in enumerate(gradient_buffer):
        gradient_buffer[index] = gradient * 0

    for episode in range(training_episodes):
        state = env.reset()

        episode_history = []
        episode_rewards = 0

        for step in range(max_steps_per_episode):
            if episode % 10 == 0:
                env.render()

            # Get weights for each action
            action_probabilities = sess.run(agent.outputs, feed_dict={agent.input_layer: [state]})
            action_choice = np.random.choice(range(num_actions), p=action_probabilities[0])

            state_next, reward, done, _ = env.step(action_choice)
            episode_history.append([state, action_choice, reward, state_next])
            state = state_next
            episode_rewards += reward

            if done or step + 1 == max_steps_per_episode:
                total_episode_rewards.append(episode_rewards)

                episode_history = np.array(episode_history)
                episode_history[:, 2] = discount_normalize_rewards(episode_history[:, 2])

                ep_gradients = sess.run(agent.gradients,
                                        feed_dict={agent.input_layer: np.vstack(episode_history[:, 0]),
                                                   agent.actions: episode_history[:, 1],
                                                   agent.rewards: episode_history[:, 2]})

                # Add the gradients to the grad buffer
                for index, gradient in enumerate(ep_gradients):
                    gradient_buffer[index] += gradient

                break

        if episode % episode_batch_size == 0:
            feed_dict_gradients = dict(zip(agent.gradients_to_apply, gradient_buffer))
            sess.run(agent.update_gradients, feed_dict=feed_dict_gradients)

            for index, gradient in enumerate(gradient_buffer):
                gradient_buffer[index] = gradient * 0

        if episode % 10 == 0:
            saver.save(sess, path + "pg-checkpoint", episode)
            print("Average reward / 100 eps: " + str(np.mean(total_episode_rewards[-100:])))
Then, after running that cell, I get this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-13-e284342309c6> in <module>()
41
42 if episode % 10 == 0:
---> 43 env.render()
44
45 # Get weights for each action
~\AppData\Local\Programs\Python\Python36\lib\site-packages\gym\core.py in render(self, mode)
282
283 def render(self, mode='human'):
--> 284 return self.env.render(mode)
285
286 def close(self):
~\AppData\Local\Programs\Python\Python36\lib\site-packages\gym\envs\classic_control\cartpole.py in render(self, mode)
135 self.poletrans.set_rotation(-x[2])
136
--> 137 return self.viewer.render(return_rgb_array = mode=='rgb_array')
138
139 def close(self):
~\AppData\Local\Programs\Python\Python36\lib\site-packages\gym\envs\classic_control\rendering.py in render(self, return_rgb_array)
82 glClearColor(1,1,1,1)
83 self.window.clear()
---> 84 self.window.switch_to()
85 self.window.dispatch_events()
86 self.transform.enable()
~\AppData\Local\Programs\Python\Python36\lib\site-packages\pyglet\window\win32\__init__.py in switch_to(self)
315
316 def switch_to(self):
--> 317 self.context.set_current()
318
319 def flip(self):
AttributeError: 'NoneType' object has no attribute 'set_current'
My computer specs are: HP Pavilion dv7 4080us, CPU: Intel Core i7 Q720 1.60GHz (4 cores / 8 logical processors), 8 GB RAM, AMD Mobility Radeon HD 5000 series, TensorFlow 1.4.0, OpenAI Gym 0.10.5, Python 3.6.5.
Thanks for your help!
Answer 0 (score: 0)
To fix this issue, install version 1.2.4 of pyglet in your virtualenv or conda environment (whichever you are using). Run in a terminal:

pip install pyglet==1.2.4

This will allow env.render() to display the simulated environment.
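As a quick sanity check (my own suggestion, not part of the answer above), you can confirm that the pinned version is the one your Python environment actually imports before retrying the render call; pyglet exposes its version string as pyglet.version:

# Minimal check that the downgrade took effect in the current environment
import pyglet
print(pyglet.version)  # expect '1.2.4' after the reinstall

If this still prints a newer version, the install likely went to a different interpreter than the one running your notebook.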