No learning in reinforcement learning agent

Date: 2018-03-24 18:42:25

Tags: python tensorflow keras reinforcement-learning

I am trying to implement the deep deterministic policy gradient (DDPG) method in TensorFlow and Keras, but I seem to be stuck. No learning appears to take place, the actions the model takes do not seem to change at all, and the gradients applied to the actor network are very small (on the order of 1e-5). I have used another implementation as a reference, and it works well with exactly the same hyperparameters and network architecture (except that it is written with tflearn and includes batch normalization layers), which leads me to believe there is a bug somewhere in my code. Maybe someone can spot it. Thanks for your time!

EDIT: I believe the reason for the poor performance is that the critic network's gradient with respect to the action is vanishing. However, I cannot figure out why. Maybe I am using the Concatenate layer incorrectly? (A small gradient-check sketch is included after the code below.)

# Imports assumed by the code below (standalone Keras on the TensorFlow backend):
import random
from collections import deque

import numpy as np
import tensorflow as tf
from keras.models import Model
from keras.layers import Input, Dense, Concatenate
from keras.initializers import TruncatedNormal, RandomUniform


class AIInterface(object):

    def __init__(self, sim):

        self.sim = sim

        self.pedal_pos = 0
        self.steering_pos = 0

        self.sess = tf.Session()

        self.learning_rate = 10e-4
        self.BATCH_SIZE = 64
        self.epsilon = .75  # amount of random exploration
        self.epsilon_decay = .997
        self.gamma = .99  # reward discount factor
        self.tau = .00125  # target update factor

        self.rewards = deque(maxlen=100000)
        self.memory = deque(maxlen=100000)

        # Actor stuff
        self.actor_model, self.actor_var_in = self.initialize_actor()
        self.target_actor, _ = self.initialize_actor()

        self.actor_critic_grad = tf.placeholder(tf.float32, [None, 1])

        self.actor_model_weights = self.actor_model.trainable_weights

        with tf.name_scope("actor_gradients"):
            self.actor_grads = tf.gradients(self.actor_model.output, self.actor_model_weights, -self.actor_critic_grad)
            self.normalized_actor_grads = list(map(lambda x: tf.div(x, self.BATCH_SIZE), self.actor_grads))

        grads = zip(self.normalized_actor_grads, self.actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(grads)

        # Critic stuff
        self.critic_model, self.critic_var_in, self.critic_action_in = self.initialize_critic()
        self.target_critic, _, _ = self.initialize_critic()

        with tf.name_scope("CriticGrads"):
            self.critic_grads = tf.gradients(self.critic_model.output, self.critic_action_in)

        self.sess.run(tf.global_variables_initializer())

        self.target_actor.set_weights(self.actor_model.get_weights())
        self.target_critic.set_weights(self.critic_model.get_weights())

        self.global_step = 0

    def initialize_actor(self):

        state_variable_input = Input(shape=(3, ))

        init = TruncatedNormal(mean=0.0, stddev=0.02)

        dense = Dense(128, activation="relu", kernel_initializer=init)(state_variable_input)
        dense2 = Dense(128, activation="relu", kernel_initializer=init)(dense)

        output = Dense(1, activation="tanh", kernel_initializer=RandomUniform(-3e-3, 3e-3))(dense2)

        model = Model(inputs=state_variable_input,
                      outputs=output)

        model.compile(optimizer="adam", loss="mse")

        return model, state_variable_input


    def initialize_critic(self):

        state_variable_input = Input(shape=(3, ))
        action_input = Input(shape=(1, ))

        init = TruncatedNormal(mean=0.0, stddev=0.02)
        dense_state = Dense(128, activation="relu", kernel_initializer=init)(state_variable_input)

        merge = Concatenate()([dense_state, action_input])
        dense2 = Dense(128, activation="relu", kernel_initializer=init)(merge)

        output = Dense(1, activation="linear", kernel_initializer=RandomUniform(-3e-3, 3e-3))(dense2)

        model = Model(inputs=[state_variable_input, action_input],
                      outputs=output)

        model.compile(optimizer="adam", loss="mse")

        return model, state_variable_input, action_input


    def train(self):

        if len(self.memory) < self.BATCH_SIZE:
            return

        samples = random.sample(self.memory, self.BATCH_SIZE)

        samples = [np.concatenate(x) for x in zip(*samples)]

        self.train_critic(samples)
        self.train_actor(samples)
        self.global_step += 1


    def train_critic(self, samples):

        cur_state_var, action, reward, new_state_var = samples

        predicted_action = self.target_actor.predict([new_state_var])

        future_reward = self.target_critic.predict([new_state_var, predicted_action])

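        # TD target: y = r + gamma * Q_target(s', mu_target(s'))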
        Q = reward + self.gamma*future_reward

        self.critic_model.train_on_batch([cur_state_var, action], Q)


    def train_actor(self, samples):

        cur_state_var, action, reward, new_state_var = samples
        predicted_action = self.actor_model.predict([cur_state_var])

        grads = self.sess.run([self.critic_grads], feed_dict={
            self.critic_var_in: cur_state_var,
            self.critic_action_in: predicted_action})

        self.sess.run(self.optimize, feed_dict={
            self.actor_var_in: cur_state_var,
            self.actor_critic_grad: grads[0]
            })

    def update_actor_target(self):

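        # Soft (Polyak) update: theta_target <- tau*theta + (1 - tau)*theta_target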
        actor_model_weights = self.actor_model.get_weights()
        actor_target_weights = self.target_actor.get_weights()

        for i in range(len(actor_target_weights)):
            actor_target_weights[i] = self.tau * actor_model_weights[i] + (1-self.tau)*actor_target_weights[i]
        self.target_actor.set_weights(actor_target_weights)

    def update_critic_target(self):

        critic_model_weights = self.critic_model.get_weights()
        critic_target_weights = self.target_critic.get_weights()

        for i in range(len(critic_target_weights)):
            critic_target_weights[i] = self.tau * critic_model_weights[i] + (1-self.tau)*critic_target_weights[i]
        self.target_critic.set_weights(critic_target_weights)

    def update_model(self):

        self.update_actor_target()
        self.update_critic_target()


    def act(self, cur_state_var, noise=None, env=None):

        if env:
            if np.random.random() < self.epsilon:
                return env.action_space.sample()

            else:
                sh = cur_state_var.shape
                action = self.actor_model.predict([cur_state_var], batch_size=1)[0]

                return action

        elif not noise:
            if np.random.random() < self.epsilon:
                return self.sample_action_space()
            return self.actor_model.predict([cur_state_var], batch_size=1)[0]
        else:
            no = noise()
            pred = self.actor_model.predict([cur_state_var], batch_size=1)[0]

            return pred + no

    def sample_action_space(self):

        return np.array([random.uniform(-0.5, 0.5), random.uniform(-1.0, 1.0)]).reshape(2, )

    def remember(self, cur_state_var, action, reward, new_state_var):

        self.memory.append([cur_state_var, action, reward, new_state_var])
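
For reference, here is a minimal way to inspect the critic's action gradient in isolation (a sketch using random dummy inputs; AIInterface is the class above, and the sim argument is not used by this check):

import numpy as np

agent = AIInterface(sim=None)  # sim is unused by this gradient check

# Random dummy batch matching the network input shapes (state: 3, action: 1).
dummy_states = np.random.uniform(-1.0, 1.0, size=(64, 3))
dummy_actions = np.random.uniform(-1.0, 1.0, size=(64, 1))

# dQ/da over the batch; if the mean magnitude is around 1e-5, the actor
# update has almost nothing to work with.
grads = agent.sess.run(agent.critic_grads, feed_dict={
    agent.critic_var_in: dummy_states,
    agent.critic_action_in: dummy_actions})

print("mean |dQ/da|:", np.abs(grads[0]).mean())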

0 Answers:

No answers yet.