I'm trying to train a basic OpenAI CartPole environment with TensorFlow...
Here is the code I'm trying to run (separated by notebook cells):
import tensorflow as tf
import gym
import numpy as np
import matplotlib.pyplot as plt
import os
%matplotlib inline
env = gym.make('CartPole-v1')
print(env.observation_space)
print(env.action_space)
games_to_play = 10

for i in range(games_to_play):
    # Reset the environment
    obs = env.reset()
    episode_rewards = 0
    done = False

    while not done:
        # Render the environment so we can watch
        env.render()

        # Choose a random action
        action = env.action_space.sample()

        # Take a step in the environment with the chosen action
        obs, reward, done, info = env.step(action)
        episode_rewards += reward

    # Print episode total rewards when done
    print(episode_rewards)

# Close the environment
env.close()
class Agent:
    def __init__(self, num_actions, state_size):
        initializer = tf.contrib.layers.xavier_initializer()

        self.input_layer = tf.placeholder(dtype=tf.float32, shape=[None, state_size])

        # Neural net starts here
        hidden_layer = tf.layers.dense(self.input_layer, 8, activation=tf.nn.relu, kernel_initializer=initializer)
        hidden_layer_2 = tf.layers.dense(hidden_layer, 8, activation=tf.nn.relu, kernel_initializer=initializer)

        # Output of neural net
        out = tf.layers.dense(hidden_layer_2, num_actions, activation=None)

        self.outputs = tf.nn.softmax(out)
        self.choice = tf.argmax(self.outputs, axis=1)

        # Training Procedure
        self.rewards = tf.placeholder(shape=[None, ], dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None, ], dtype=tf.int32)

        one_hot_actions = tf.one_hot(self.actions, num_actions)

        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=out, labels=one_hot_actions)

        self.loss = tf.reduce_mean(cross_entropy * self.rewards)

        self.gradients = tf.gradients(self.loss, tf.trainable_variables())

        # Create a placeholder list for gradients
        self.gradients_to_apply = []
        for index, variable in enumerate(tf.trainable_variables()):
            gradient_placeholder = tf.placeholder(tf.float32)
            self.gradients_to_apply.append(gradient_placeholder)

        # Create the operation to update gradients with the gradients placeholder.
        optimizer = tf.train.AdamOptimizer(learning_rate=1e-2)
        self.update_gradients = optimizer.apply_gradients(zip(self.gradients_to_apply, tf.trainable_variables()))
discount_rate = 0.95

def discount_normalize_rewards(rewards):
    discounted_rewards = np.zeros_like(rewards)
    total_rewards = 0

    for i in reversed(range(len(rewards))):
        total_rewards = total_rewards * discount_rate + rewards[i]
        discounted_rewards[i] = total_rewards

    # Normalize the discounted rewards before returning them
    discounted_rewards -= np.mean(discounted_rewards)
    discounted_rewards /= np.std(discounted_rewards)

    return discounted_rewards
tf.reset_default_graph()
# Modify these to match shape of actions and states in your environment
num_actions = 2
state_size = 4
path = "./cartpole-pg/"
training_episodes = 1000
max_steps_per_episode = 10000
episode_batch_size = 5
agent = Agent(num_actions, state_size)
init = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=2)
if not os.path.exists(path):
    os.makedirs(path)
with tf.Session() as sess:
    sess.run(init)

    total_episode_rewards = []

    # Create a buffer of 0'd gradients
    gradient_buffer = sess.run(tf.trainable_variables())
    for index, gradient in enumerate(gradient_buffer):
        gradient_buffer[index] = gradient * 0

    for episode in range(training_episodes):
        state = env.reset()

        episode_history = []
        episode_rewards = 0

        for step in range(max_steps_per_episode):
            if episode % 10 == 0:
                env.render()

            # Get weights for each action
            action_probabilities = sess.run(agent.outputs, feed_dict={agent.input_layer: [state]})
            action_choice = np.random.choice(range(num_actions), p=action_probabilities[0])

            state_next, reward, done, _ = env.step(action_choice)
            episode_history.append([state, action_choice, reward, state_next])
            state = state_next
            episode_rewards += reward

            if done or step + 1 == max_steps_per_episode:
                total_episode_rewards.append(episode_rewards)

                episode_history = np.array(episode_history)
                episode_history[:, 2] = discount_normalize_rewards(episode_history[:, 2])

                ep_gradients = sess.run(agent.gradients,
                                        feed_dict={agent.input_layer: np.vstack(episode_history[:, 0]),
                                                   agent.actions: episode_history[:, 1],
                                                   agent.rewards: episode_history[:, 2]})

                # Add the gradients to the grad buffer
                for index, gradient in enumerate(ep_gradients):
                    gradient_buffer[index] += gradient

                break

        if episode % episode_batch_size == 0:
            feed_dict_gradients = dict(zip(agent.gradients_to_apply, gradient_buffer))
            sess.run(agent.update_gradients, feed_dict=feed_dict_gradients)

            for index, gradient in enumerate(gradient_buffer):
                gradient_buffer[index] = gradient * 0

        if episode % 10 == 0:
            saver.save(sess, path + "pg-checkpoint", episode)
            print("Average reward / 100 eps: " + str(np.mean(total_episode_rewards[-100:])))
Then, after running that cell, I get this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-13-e284342309c6> in <module>()
41
42 if episode % 10 == 0:
---> 43 env.render()
44
45 # Get weights for each action
~\AppData\Local\Programs\Python\Python36\lib\site-packages\gym\core.py in render(self, mode)
282
283 def render(self, mode='human'):
--> 284 return self.env.render(mode)
285
286 def close(self):
~\AppData\Local\Programs\Python\Python36\lib\site-packages\gym\envs\classic_control\cartpole.py in render(self, mode)
135 self.poletrans.set_rotation(-x[2])
136
--> 137 return self.viewer.render(return_rgb_array = mode=='rgb_array')
138
139 def close(self):
~\AppData\Local\Programs\Python\Python36\lib\site-packages\gym\envs\classic_control\rendering.py in render(self, return_rgb_array)
82 glClearColor(1,1,1,1)
83 self.window.clear()
---> 84 self.window.switch_to()
85 self.window.dispatch_events()
86 self.transform.enable()
~\AppData\Local\Programs\Python\Python36\lib\site-packages\pyglet\window\win32\__init__.py in switch_to(self)
315
316 def switch_to(self):
--> 317 self.context.set_current()
318
319 def flip(self):
AttributeError: 'NoneType' object has no attribute 'set_current'
My computer specs are: HP Pavilion dv7 4080us, CPU: Intel Core i7 Q720 1.60GHz (4 cores / 8 logical processors), 8 GB RAM, AMD Mobility Radeon HD 5000 series, TensorFlow 1.4.0, OpenAI Gym 0.10.5, Python 3.6.5.
Thanks for your help!
Answer 0 (score: 0)
To fix this issue, install version 1.2.4 of pyglet in your virtualenv or conda environment (whichever you are using). Run in a terminal:

pip install pyglet==1.2.4

This will allow env.render() to display the simulated environment.
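As a quick sanity check (my own suggestion, not part of the answer above), you can confirm that the pinned version is the one your Python environment actually imports before retrying the render call; pyglet exposes its version string as pyglet.version:

# Minimal check that the downgrade took effect in the current environment
import pyglet
print(pyglet.version)  # expect '1.2.4' after the reinstall

If this still prints a newer version, the install likely went to a different interpreter than the one running your notebook.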