For learning purposes, I copied a script from GitHub. The actor-critic model works with the Pendulum task from OpenAI Gym. However, when I switch the task to the continuous mountain car environment, I get an error that I don't understand.
Does anyone know what I should do?
Please see the code below; the main function is at the bottom. Everything runs fine with the Pendulum task, but as soon as I change it to the continuous mountain car environment, I get the error message.
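For reference, the two tasks expose differently shaped state spaces. A quick standalone check (a minimal sketch; the shapes shown are those of the classic gym versions of the two environments):

import gym

# Compare the observation/action spaces of the two tasks.
for name in ('Pendulum-v0', 'MountainCarContinuous-v0'):
    env = gym.make(name)
    print(name, env.observation_space.shape, env.action_space.shape)
# Pendulum-v0 (3,) (1,)
# MountainCarContinuous-v0 (2,) (1,)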
import gym
import numpy as np
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input
from keras.layers.merge import Add, Multiply
from keras.optimizers import Adam
import keras.backend as K
import tensorflow as tf
import random
from collections import deque
# determines how to assign values to each state, i.e. takes the state
# and action (two-input model) and determines the corresponding value
class ActorCritic:
    def __init__(self, env, sess):
        self.env = env
        self.sess = sess
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = .995
        self.gamma = .95
        self.tau = .125

        # ===================================================================== #
        #                              Actor Model                              #
        # ===================================================================== #
        self.memory = deque(maxlen=2000)
        self.actor_state_input, self.actor_model = self.create_actor_model()
        _, self.target_actor_model = self.create_actor_model()

        self.actor_critic_grad = tf.placeholder(
            tf.float32,
            [None, self.env.action_space.shape[0]])  # where we will feed de/dC (from critic)

        actor_model_weights = self.actor_model.trainable_weights
        self.actor_grads = tf.gradients(
            self.actor_model.output,
            actor_model_weights, -self.actor_critic_grad)  # dC/dA (from actor)
        grads = zip(self.actor_grads, actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(grads)

        # ===================================================================== #
        #                              Critic Model                             #
        # ===================================================================== #
        self.critic_state_input, self.critic_action_input, \
            self.critic_model = self.create_critic_model()
        _, _, self.target_critic_model = self.create_critic_model()

        self.critic_grads = tf.gradients(
            self.critic_model.output,
            self.critic_action_input)  # where we calculate de/dC for feeding above

        # Initialize for later gradient calculations
        self.sess.run(tf.global_variables_initializer())
    # ========================================================================= #
    #                             Model Definitions                             #
    # ========================================================================= #
    def create_actor_model(self):
        state_input = Input(shape=self.env.observation_space.shape)
        h1 = Dense(24, activation='relu')(state_input)
        h2 = Dense(48, activation='relu')(h1)
        h3 = Dense(24, activation='relu')(h2)
        output = Dense(self.env.action_space.shape[0], activation='relu')(h3)

        model = Model(inputs=state_input, outputs=output)
        print('--------- Actor Model --------- ')
        model.summary()
        adam = Adam(lr=0.001)
        model.compile(loss="mse", optimizer=adam)
        return state_input, model
    def create_critic_model(self):
        # two-input critic: a state branch and an action branch, merged with Add()
        state_input = Input(shape=self.env.observation_space.shape)
        state_h1 = Dense(24, activation='relu')(state_input)
        state_h2 = Dense(48)(state_h1)

        action_input = Input(shape=self.env.action_space.shape)
        action_h1 = Dense(48)(action_input)

        merged = Add()([state_h2, action_h1])
        merged_h1 = Dense(24, activation='relu')(merged)
        output = Dense(1, activation='relu')(merged_h1)

        print('--------- Critic Model --------- ')
        model = Model(inputs=[state_input, action_input], outputs=output)
        model.summary()
        adam = Adam(lr=0.001)
        model.compile(loss="mse", optimizer=adam)
        return state_input, action_input, model
    # ========================================================================= #
    #                              Model Training                               #
    # ========================================================================= #
    def remember(self, cur_state, action, reward, new_state, done):
        self.memory.append([cur_state, action, reward, new_state, done])

    def _train_actor(self, samples):
        for sample in samples:
            cur_state, action, reward, new_state, _ = sample
            predicted_action = self.actor_model.predict(cur_state)
            grads = self.sess.run(self.critic_grads, feed_dict={
                self.critic_state_input: cur_state,
                self.critic_action_input: predicted_action
            })[0]
            self.sess.run(self.optimize, feed_dict={
                self.actor_state_input: cur_state,
                self.actor_critic_grad: grads
            })
    def _train_critic(self, samples):
        for sample in samples:
            cur_state, action, reward, new_state, done = sample
            print(cur_state, action, reward, new_state, done)
            if not done:
                # bootstrap: add the discounted value of the next state,
                # estimated by the target networks
                target_action = self.target_actor_model.predict(new_state)
                future_reward = self.target_critic_model.predict(
                    [new_state, target_action])[0][0]
                reward += self.gamma * future_reward
            print(cur_state)
            print(action)
            self.critic_model.fit([cur_state, action], reward, verbose=0)
    def train(self):
        batch_size = 32
        if len(self.memory) < batch_size:
            return
        rewards = []
        samples = random.sample(self.memory, batch_size)
        self._train_critic(samples)
        self._train_actor(samples)
    # ========================================================================= #
    #                           Target Model Updating                           #
    # ========================================================================= #
    def _update_actor_target(self):
        actor_model_weights = self.actor_model.get_weights()
        actor_target_weights = self.target_actor_model.get_weights()
        for i in range(len(actor_target_weights)):
            actor_target_weights[i] = actor_model_weights[i]
        self.target_actor_model.set_weights(actor_target_weights)

    def _update_critic_target(self):
        critic_model_weights = self.critic_model.get_weights()
        critic_target_weights = self.target_critic_model.get_weights()
        for i in range(len(critic_target_weights)):
            critic_target_weights[i] = critic_model_weights[i]
        self.target_critic_model.set_weights(critic_target_weights)

    def update_target(self):
        self._update_actor_target()
        self._update_critic_target()
    # ========================================================================= #
    #                             Model Predictions                             #
    # ========================================================================= #
    def act(self, cur_state):
        self.epsilon *= self.epsilon_decay
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return self.actor_model.predict(cur_state)
def main():
    sess = tf.Session()
    K.set_session(sess)
    # env = gym.make('MountainCarContinuous-v0')  ---> This one does not work!
    env = gym.make('Pendulum-v0')
    actor_critic = ActorCritic(env, sess)

    num_trials = 10000
    trial_len = 500

    cur_state = env.reset()
    action = env.action_space.sample()
    while True:
        env.render()
        cur_state = cur_state.reshape((1, env.observation_space.shape[0]))
        action = actor_critic.act(cur_state)
        action = action.reshape((1, env.action_space.shape[0]))

        new_state, reward, done, _ = env.step(action)
        new_state = new_state.reshape((1, env.observation_space.shape[0]))

        actor_critic.remember(cur_state, action, reward, new_state, done)
        actor_critic.train()

        cur_state = new_state


if __name__ == "__main__":
    main()
Here is the error I receive:
ValueError Traceback (most recent call last)
<ipython-input-41-5ae61cffb181> in <module>()
204
205 if __name__ == "__main__":
--> 206 main()
<ipython-input-41-5ae61cffb181> in main()
199
200 actor_critic.remember(cur_state, action, reward, new_state, done)
--> 201 actor_critic.train()
202
203 cur_state = new_state
<ipython-input-41-5ae61cffb181> in train(self)
140 rewards = []
141 samples = random.sample(self.memory, batch_size)
--> 142 self._train_critic(samples)
143 self._train_actor(samples)
144
<ipython-input-41-5ae61cffb181> in _train_critic(self, samples)
131 print(cur_state)
132 print(action)
--> 133 self.critic_model.fit([cur_state, action], reward, verbose=0)
134
135 def train(self):
~\Anaconda3_5_2\envs\tensorflow\lib\site-packages\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
948 sample_weight=sample_weight,
949 class_weight=class_weight,
--> 950 batch_size=batch_size)
951 # Prepare validation data.
952 do_validation = False
~\Anaconda3_5_2\envs\tensorflow\lib\site-packages\keras\engine\training.py in _standardize_user_data(self, x, y, sample_weight, class_weight, check_array_lengths, batch_size)
785 feed_output_shapes,
786 check_batch_axis=False, # Don't enforce the batch size.
--> 787 exception_prefix='target')
788
789 # Generate sample-wise weight values given the `sample_weight` and
~\Anaconda3_5_2\envs\tensorflow\lib\site-packages\keras\engine\training_utils.py in standardize_input_data(data, names, shapes, check_batch_axis, exception_prefix)
125 ': expected ' + names[i] + ' to have ' +
126 str(len(shape)) + ' dimensions, but got array '
--> 127 'with shape ' + str(data_shape))
128 if not check_batch_axis:
129 data_shape = data_shape[1:]
ValueError: Error when checking target: expected dense_247 to have 2 dimensions, but got array with shape ()
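For context, the failing call is critic_model.fit([cur_state, action], reward, verbose=0), and the message says the fit target arrived with shape (), i.e. as a 0-dimensional scalar, while the critic's output is 2-dimensional (batch, 1). The mismatch can be reproduced in isolation with a toy single-output model (a minimal standalone sketch, not part of the script above):

import numpy as np
from keras.models import Model
from keras.layers import Input, Dense

# Toy model whose output shape (batch, 1) mirrors the critic's.
inp = Input(shape=(2,))
out = Dense(1)(inp)
model = Model(inputs=inp, outputs=out)
model.compile(loss='mse', optimizer='adam')

x = np.zeros((1, 2))
model.fit(x, np.float32(1.0), verbose=0)      # 0-d target -> same ValueError
# model.fit(x, np.array([[1.0]]), verbose=0)  # a (1, 1) target is accepted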