My Deep Deterministic Policy Gradient (DDPG) model does not learn anything even after 2000 episodes

Asked: 2019-06-19 21:31:39

Tags: python artificial-intelligence reinforcement-learning

I have tried different hyperparameters as well as different numbers of layers and nodes, but even after 2000 episodes my model does not learn anything. I also tried the MountainCarContinuous-v0 environment, and that did not work either.

I have also tried different architectures and models from GitHub, but my model still does not learn anything. The full script is below.

import numpy as np
import tensorflow as tf
import random
import gym
import pylab
import sys

from keras.initializers import RandomUniform
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.layers import Dense, Input, Add, Concatenate, Flatten, GaussianNoise, Lambda
from keras import backend as K
from collections import deque

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
K.set_session(sess)

EPISODES = 100000

class Actor(object):

    def __init__(self, sess, state_size, action_size, TAU, lr, action_bound=1, load=False):
        self.sess = sess
        self.TAU = TAU
        self.lr = lr
        self.load = load
        self.action_bound = action_bound

        self.model, self.weights, self.state = self.build_network(state_size, action_size)
        self.target_model, self.target_weights, self.target_state = self.build_network(state_size, action_size)
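        # The critic supplies dQ/da for each sampled state; tf.gradients then applies the
        # chain rule to get the policy gradient w.r.t. the actor weights. The minus sign
        # turns Adam's minimization into gradient ascent on Q.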
        self.q_grads = tf.placeholder(tf.float32, [None, action_size])
        self.grads = tf.gradients(self.model.output, self.weights, -self.q_grads)
        self.optimize = tf.train.AdamOptimizer(lr).apply_gradients(zip(self.grads, self.weights))
        self.sess.run(tf.global_variables_initializer())

        if self.load:
            self.model.load_weights("./DDPG_Actor.h5")
            self.target_model.load_weights("./DDPG_Actor_target.h5")

    def train(self, state, grads):
        self.sess.run(self.optimize, feed_dict={self.state : state, self.q_grads : grads})

    def update(self):
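        # Soft (Polyak) target update: target <- TAU * online + (1 - TAU) * target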
        W, target_W = self.model.get_weights(), self.target_model.get_weights()
        for i in range(len(W)):
            target_W[i] = self.TAU*W[i] + (1 - self.TAU)*target_W[i]
        self.target_model.set_weights(target_W)

    def save(self):
        self.model.save_weights("./DDPG_Actor.h5")
        self.target_model.save_weights("./DDPG_Actor_target.h5")

    def build_network(self, state_size, action_size):
        state_input = Input(shape=[state_size])
        X = Dense(400, activation='relu', kernel_initializer='glorot_normal')(state_input)
        X = Dense(300, activation='relu', kernel_initializer='glorot_normal')(X)
        output = Dense(action_size, activation='tanh', kernel_initializer='glorot_normal')(X)
        output = Lambda(lambda i: i * self.action_bound)(output)  # scale tanh output to the action range
        model = Model(inputs=state_input, outputs=output)
        return model, model.trainable_weights, state_input


class Critic(object):

    def __init__(self, sess, state_size, action_size, TAU, lr, load=False):
        self.sess = sess
        self.TAU = TAU
        self.lr = lr
        self.load = load
        self.optimizer = tf.train.AdamOptimizer(lr)

        self.model, self.state, self.action = self.build_network(state_size, action_size)
        self.target_model, self.target_state, self.target_action = self.build_network(state_size, action_size)
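        # Gradient of the predicted Q-value with respect to the action input,
        # used by the actor for its policy update.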
        self.q_grads = tf.gradients(self.model.output, self.action)
        self.sess.run(tf.global_variables_initializer())

        if self.load:
            self.model.load_weights("./DDPG_Critic.h5")
            self.target_model.load_weights("./DDPG_Critic_target.h5")

    def gradients(self, state, action):
        return self.sess.run(self.q_grads, feed_dict={self.state : state, self.action : action})[0]

    def save(self):
        self.model.save_weights("./DDPG_Critic.h5")
        self.target_model.save_weights("./DDPG_Critic_target.h5")

    def update(self):
        W, target_W = self.model.get_weights(), self.target_model.get_weights()
        for i in range(len(W)):
            target_W[i] = self.TAU*W[i] + (1 - self.TAU)*target_W[i]
        self.target_model.set_weights(target_W)        

    def build_network(self, state_size, action_size):
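        # State and action each pass through a 400-unit layer, are summed, and a
        # 300-unit layer feeds the Q-value output (action_size is 1 here).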
        S = Input(shape=[state_size])
        A = Input(shape=[action_size])
        X1 = Dense(400, activation='relu', kernel_initializer='glorot_normal')(S)
        X2 = Dense(400, activation='relu', kernel_initializer='glorot_normal')(A)
        X = Add()([X1,X2])
        X = Dense(300, activation='relu', kernel_initializer='glorot_normal')(X)
        output = Dense(action_size, activation='linear', kernel_initializer='glorot_normal')(X)
        model = Model(inputs=[S, A], outputs=output)
        model.compile(loss='mse', optimizer=Adam(lr=self.lr))
        return model, S, A


class DDPG(object):
    def __init__(self, sess, state_size, action_size, action_bound=1, memory_size=5000, batch_size=64, actor_lr=0.0001, critic_lr=0.001, gamma=0.99, TAU=0.001):
        self.sess = sess
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)
        self.batch_size = batch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.gamma = gamma
        self.TAU = TAU
        self.train_start = 1000
        self.epsilon =  1
        self.epsilon_min = 0.001
        self.mu = 0.0
        self.x = 0
        self.theta = 0.01
        self.sigma = 0.1
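        # Noise scale is annealed linearly from 1.0 down to epsilon_min over 100000 stored transitions.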
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / 100000


        self.actor = Actor(sess, state_size, action_size, TAU, actor_lr,action_bound, load=False)
        self.critic = Critic(sess, state_size, action_size, TAU, critic_lr, load=False)

    def append(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

    def OU(self):
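        # One Euler step of an Ornstein-Uhlenbeck process (mean-reverting exploration noise).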
        dx = self.theta*(self.mu - self.x) + self.sigma*np.random.randn(1)
        self.x += dx
        return self.x

    def get_action(self, state):
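        # Deterministic action from the actor plus OU noise scaled by the current epsilon.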
        action =  self.actor.model.predict(state)[0]
        return action + self.OU()*self.epsilon

    def save(self):
        self.actor.save()
        self.critic.save()

    def train(self):
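        # Wait until the replay buffer holds train_start transitions, then sample a
        # mini-batch, fit the critic on the TD target y = r + gamma * Q'(s', mu'(s')),
        # and update the actor with the critic's action gradients.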
        if len(self.memory) < self.train_start:
            return

        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        states = np.asarray([e[0] for e in mini_batch])
        states = np.reshape(states, [batch_size, self.state_size])
        actions = np.asarray([e[1] for e in mini_batch])
        rewards = np.asarray([e[2] for e in mini_batch])
        next_states = np.asarray([e[3] for e in mini_batch])
        next_states = np.reshape(next_states, [batch_size, self.state_size])
        done = np.asarray([e[4] for e in mini_batch])
        target = np.zeros_like(actions)

        target_q_values = self.critic.target_model.predict([next_states, self.actor.target_model.predict(next_states)])
        for i in range(len(mini_batch)):
            if done[i]:
                target[i] = rewards[i]

            else :
                target[i] = rewards[i] + self.gamma*target_q_values[i]

        loss = self.critic.model.train_on_batch([states, actions], target)
        action_for_grad = self.actor.model.predict(states)
        q_grads = self.critic.gradients(states,action_for_grad)
        self.actor.train(states,q_grads)
        self.actor.update()
        self.critic.update()


env = gym.make('Pendulum-v0')

state_size = env.observation_space.shape[0]
action_size = 1
action_bound = env.action_space.high
agent = DDPG(sess, state_size, action_size, action_bound)


scores, episodes = [], []

for e in range(EPISODES):
    done = False
    score = 0
    state = env.reset()
    state = np.reshape(state, [1,state_size])
    step = 0

    while not done:

        action = agent.get_action(state)
        #print(action)
        next_state, reward, done, _ = env.step([action])
        next_state = np.reshape(next_state,[1,state_size])

        score += reward[0]
        agent.append(state, action, reward, next_state, done)
        state = next_state

        step += 1
        if step % 20 == 0:
            agent.train()

        if done:

            scores.append(score)
            episodes.append(e)
            pylab.plot(episodes, scores, 'b')
            pylab.savefig("./DDPG_Pendulum.png")
            print("episode:", e, " score:", score, " epsilon:", agent.epsilon)

            #if np.mean(scores[-min(10, len(scores)) :]) > -120 :
                #sys.exit()


    if e % 50 == 0:
        agent.save()

I always get a reward of about -1450 per episode.

0 Answers
