Error in actor-critic model: ValueError: expected dense to have 2 dimensions, but got array with shape ()

Time: 2018-12-31 13:12:30

Tags: machine-learning reinforcement-learning

For learning purposes I copied a script from GitHub. The actor-critic model works with the Pendulum task from OpenAI Gym. However, when I switch the task to MountainCarContinuous, I get an error that I don't understand.

Does anyone know what I should do?

See the code below; the main function is at the bottom. When I use the Pendulum task everything works fine, but as soon as I change it to MountainCarContinuous I get the error message.
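For reference, the two environments do not expose the same shapes; the short check below (separate from the full script that follows, just a way to inspect the spaces) prints them:

import gym

# Pendulum-v0 has a 3-dimensional observation and a 1-dimensional action;
# MountainCarContinuous-v0 has a 2-dimensional observation and a 1-dimensional action.
for env_id in ['Pendulum-v0', 'MountainCarContinuous-v0']:
    env = gym.make(env_id)
    print(env_id, env.observation_space.shape, env.action_space.shape)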

import gym
import numpy as np 
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input
from keras.layers.merge import Add, Multiply
from keras.optimizers import Adam
import keras.backend as K

import tensorflow as tf

import random
from collections import deque

# determines how to assign values to each state, i.e. takes the state
# and action (two-input model) and determines the corresponding value
class ActorCritic:
    def __init__(self, env, sess):
        self.env  = env
        self.sess = sess

        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = .995
        self.gamma = .95
        self.tau   = .125

        # ===================================================================== #
        #                               Actor Model                             #
        # ===================================================================== #

        self.memory = deque(maxlen=2000)
        self.actor_state_input, self.actor_model = self.create_actor_model()
        _, self.target_actor_model = self.create_actor_model()

        self.actor_critic_grad = tf.placeholder(tf.float32,
            [None, self.env.action_space.shape[0]])  # where we will feed de/dC (from critic)

        actor_model_weights = self.actor_model.trainable_weights
        self.actor_grads = tf.gradients(self.actor_model.output,
            actor_model_weights, -self.actor_critic_grad)  # dC/dA (from actor)
        grads = zip(self.actor_grads, actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(grads)

        # ===================================================================== #
        #                              Critic Model                             #
        # ===================================================================== #

        self.critic_state_input, self.critic_action_input, \
            self.critic_model = self.create_critic_model()
        _, _, self.target_critic_model = self.create_critic_model()

        self.critic_grads = tf.gradients(self.critic_model.output,
            self.critic_action_input)  # where we calculate de/dC for feeding above

        # Initialize for later gradient calculations
        self.sess.run(tf.global_variables_initializer())

    # ========================================================================= #
    #                              Model Definitions                            #
    # ========================================================================= #

    def create_actor_model(self):
        state_input = Input(shape=self.env.observation_space.shape)
        h1 = Dense(24, activation='relu')(state_input)
        h2 = Dense(48, activation='relu')(h1)
        h3 = Dense(24, activation='relu')(h2)
        output = Dense(self.env.action_space.shape[0], activation='relu')(h3)

        model = Model(inputs=state_input, outputs=output)
        print('--------- Actor Model --------- ')
        model.summary()
        adam  = Adam(lr=0.001)
        model.compile(loss="mse", optimizer=adam)
        return state_input, model

    def create_critic_model(self):
        state_input = Input(shape=self.env.observation_space.shape)
        state_h1 = Dense(24, activation='relu')(state_input)
        state_h2 = Dense(48)(state_h1)

        action_input = Input(shape=self.env.action_space.shape)
        action_h1    = Dense(48)(action_input)

        merged    = Add()([state_h2, action_h1])
        merged_h1 = Dense(24, activation='relu')(merged)
        output = Dense(1, activation='relu')(merged_h1)
        print('--------- Critic Model --------- ')
        model  = Model(inputs=[state_input, action_input], outputs=output)
        model.summary()

        adam  = Adam(lr=0.001)
        model.compile(loss="mse", optimizer=adam)
        return state_input, action_input, model
    # ========================================================================= #
    #                               Model Training                              #
    # ========================================================================= #

    def remember(self, cur_state, action, reward, new_state, done):
        self.memory.append([cur_state, action, reward, new_state, done])

    def _train_actor(self, samples):
        for sample in samples:
            cur_state, action, reward, new_state, _ = sample
            predicted_action = self.actor_model.predict(cur_state)
            grads = self.sess.run(self.critic_grads, feed_dict={
                self.critic_state_input:  cur_state,
                self.critic_action_input: predicted_action
            })[0]

            self.sess.run(self.optimize, feed_dict={
                self.actor_state_input: cur_state,
                self.actor_critic_grad: grads
            })

    def _train_critic(self, samples):
        for sample in samples:
            cur_state, action, reward, new_state, done = sample
            print(cur_state, action, reward, new_state, done)
            if not done:
                target_action = self.target_actor_model.predict(new_state)
                future_reward = self.target_critic_model.predict(
                    [new_state, target_action])[0][0]
                reward += self.gamma * future_reward
                print(cur_state)
                print(action)
            self.critic_model.fit([cur_state, action], reward, verbose=0)

    def train(self):
        batch_size = 32
        if len(self.memory) < batch_size:
            return

        rewards = []
        samples = random.sample(self.memory, batch_size)
        self._train_critic(samples)
        self._train_actor(samples)

    # ========================================================================= #
    #                         Target Model Updating                             #
    # ========================================================================= #

    def _update_actor_target(self):
        actor_model_weights  = self.actor_model.get_weights()
        actor_target_weights = self.target_actor_model.get_weights()

        for i in range(len(actor_target_weights)):
            actor_target_weights[i] = actor_model_weights[i]
        self.target_actor_model.set_weights(actor_target_weights)

    def _update_critic_target(self):
        critic_model_weights  = self.critic_model.get_weights()
        critic_target_weights = self.target_critic_model.get_weights()

        for i in range(len(critic_target_weights)):
            critic_target_weights[i] = critic_model_weights[i]
        self.target_critic_model.set_weights(critic_target_weights)

    def update_target(self):
        self._update_actor_target()
        self._update_critic_target()

    # ========================================================================= #
    #                              Model Predictions                            #
    # ========================================================================= #

    def act(self, cur_state):
        self.epsilon *= self.epsilon_decay
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return self.actor_model.predict(cur_state)

def main():
    sess = tf.Session()
    K.set_session(sess)

    # env = gym.make('MountainCarContinuous-v0') ---> This one does not work!
    env = gym.make('Pendulum-v0')
    actor_critic = ActorCritic(env, sess)

    num_trials = 10000
    trial_len  = 500

    cur_state = env.reset()
    action = env.action_space.sample()
    while True:
        env.render()
        cur_state = cur_state.reshape((1, env.observation_space.shape[0]))
        action = actor_critic.act(cur_state)
        action = action.reshape((1, env.action_space.shape[0]))

        new_state, reward, done, _ = env.step(action)
        new_state = new_state.reshape((1, env.observation_space.shape[0]))

        actor_critic.remember(cur_state, action, reward, new_state, done)
        actor_critic.train()

        cur_state = new_state


if __name__ == "__main__":
    main()

This is the error I receive:

ValueError                                Traceback (most recent call last)
<ipython-input-41-5ae61cffb181> in <module>()
    204 
    205 if __name__ == "__main__":
--> 206         main()

<ipython-input-41-5ae61cffb181> in main()
    199 
    200         actor_critic.remember(cur_state, action, reward, new_state, done)
--> 201         actor_critic.train()
    202 
    203         cur_state = new_state

<ipython-input-41-5ae61cffb181> in train(self)
    140         rewards = []
    141         samples = random.sample(self.memory, batch_size)
--> 142         self._train_critic(samples)
    143         self._train_actor(samples)
    144 

<ipython-input-41-5ae61cffb181> in _train_critic(self, samples)
    131                 print(cur_state)
    132                 print(action)
--> 133             self.critic_model.fit([cur_state, action], reward, verbose=0)
    134 
    135     def train(self):

~\Anaconda3_5_2\envs\tensorflow\lib\site-packages\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
    948             sample_weight=sample_weight,
    949             class_weight=class_weight,
--> 950             batch_size=batch_size)
    951         # Prepare validation data.
    952         do_validation = False

~\Anaconda3_5_2\envs\tensorflow\lib\site-packages\keras\engine\training.py in _standardize_user_data(self, x, y, sample_weight, class_weight, check_array_lengths, batch_size)
    785                 feed_output_shapes,
    786                 check_batch_axis=False,  # Don't enforce the batch size.
--> 787                 exception_prefix='target')
    788 
    789             # Generate sample-wise weight values given the `sample_weight` and

~\Anaconda3_5_2\envs\tensorflow\lib\site-packages\keras\engine\training_utils.py in standardize_input_data(data, names, shapes, check_batch_axis, exception_prefix)
    125                         ': expected ' + names[i] + ' to have ' +
    126                         str(len(shape)) + ' dimensions, but got array '
--> 127                         'with shape ' + str(data_shape))
    128                 if not check_batch_axis:
    129                     data_shape = data_shape[1:]

ValueError: Error when checking target: expected dense_247 to have 2 dimensions, but got array with shape ()
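The traceback indicates that the target handed to critic_model.fit() arrives with shape (), i.e. as a bare scalar rather than a batched array. A minimal diagnostic along the following lines (an assumption about where to look, not a confirmed fix) could be placed just before the fit call in _train_critic to see what Keras actually receives:

import numpy as np

# Hypothetical check inside _train_critic, right before self.critic_model.fit(...):
# the critic's Dense(1) output expects a target with a batch dimension, e.g. (1, 1),
# so a reward that has collapsed to a 0-d NumPy scalar (shape ()) matches this error.
print(np.shape(cur_state), np.shape(action), np.shape(reward))

# One possible workaround to test (also an assumption, not verified here):
# reward = np.reshape(reward, (1, 1))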

0 Answers
