DQN with Tensorflow Keras for OpenAI Gym CartPole-v0: model does not converge

Date: 2019-08-17 17:35:07

Tags: python tensorflow keras reinforcement-learning openai-gym

I am trying to solve the standard CartPole example from OpenAI Gym with a DQN that uses experience replay and epsilon decay, but I cannot seem to get it to converge; in fact, the loss grows exponentially.

I have compared it against this example, and there does not seem to be much of a difference between my code and theirs: https://github.com/NoumaanKaleem/cartpole_ddqn/blob/master/deep-q-learning/dqn.py

Here is my code:


import tensorflow as tf
import numpy as np

#a single transition (state, action at, reward, next_state, done), as stored in replay memory
class SARSD:
    count = 0
    def __init__(self, state, at, reward, next_state, done):
        self.state = state
        self.at = at
        self.reward = reward
        self.next_state = next_state
        self.done = done
        self.ID = SARSD.count
        SARSD.count = SARSD.count + 1

class ReplayMemory:
    def __init__(self, N):
        self.mem = []
        self.N = N

    #add an element to replay memory
    #if the replay memory is already at capacity, drop the oldest item
    def add(self, elem):
        self.mem.append(elem)
        if len(self.mem) > self.N:
            self.mem.pop(0)

    #draw num_elems random transitions; note that they are also removed from memory afterwards
    def sample(self, num_elems):
        mem_length = len(self.mem)
        if num_elems > mem_length:
            return -1

        indices = np.random.randint(0,mem_length-1,num_elems)

        new_array = []
        to_remove = []
        for i in indices:
            new_array.append(self.mem[i])
            to_remove.append(self.mem[i])

        self.mem = [value for value in self.mem if value not in to_remove]
        return new_array

    def full(self):
        return len(self.mem) == self.N

    def clear(self):
        self.mem.clear()


class DQNAgent:
    def __init__(self):
        self.mem = ReplayMemory(2048)

        #epsilon greedy parameter
        self.eps = 0.1

        #discount factor
        self.gamma = 0.95

        #minibatch size
        self.minibatch_size = 32

        #epochs
        self.epochs = 1

    def learn(self, num_episodes, env, model):
        self.mem = ReplayMemory(1000)

        for episode in range(0,num_episodes):

            #if episode % int(num_episodes/10) == 0:
                #self.eps = self.eps - 0.2
            #if self.eps < 0.05:
                #self.eps = 0.05

            state = env.reset()
            state = np.reshape(state, [1,4])
            done = False

            while done == False:
                at = 0
                if np.random.rand() < self.eps:
                    #sample a random action
                    at = env.action_space.sample()
                else:
                    #get the best action from the current policy
                    at = np.argmax(model.predict(state))

                #execute action at
                next_state, reward, done, info = env.step(at)
                next_state = np.reshape(next_state, [1,4])

                #store the transition into replay memory
                data = SARSD(state,at,reward,next_state,done)
                self.mem.add(data)

                #EDIT1 : surely this wasn't helping
                state = next_state

                if self.mem.full():
                    #sample batch of transitions from replay memory
                    minibatch = self.mem.sample(self.minibatch_size)

                    #initialize the targets and the training inputs
                    y = []
                    x = []

                    for elem in minibatch:
                        #check if terminal state
                        terminal = elem.done

                        if terminal == True:
                            yj = elem.reward
                        else:
                            #discounted reward: r + gamma * max_a' Q(next_state, a')
                            yj = elem.reward + self.gamma * np.max(model.predict(elem.next_state))

                        #get the prediction
                        prediction = model.predict(elem.state)

                        #change the prediction action value with discounted reward
                        prediction[0][elem.at] = yj

                        y.append(prediction[0])
                        x.append(elem.state[0])

                    #fit the model with target y
                    x = np.array(x)
                    y = np.array(y)
                    model.fit(x,y,epochs=self.epochs)
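
Side note on the minibatch loop above: the model.predict calls inside the for loop could be batched into two calls over the whole minibatch. Below is a minimal sketch of the same target computation; build_targets is only an illustrative helper name (not something from my code), and it assumes minibatch is a list of SARSD objects exactly as returned by ReplayMemory.sample.

import numpy as np

#hypothetical helper, not part of the original code
def build_targets(model, minibatch, gamma=0.95):
    #stack the stored (1,4) arrays into (B,4) batches
    states = np.vstack([e.state for e in minibatch])
    next_states = np.vstack([e.next_state for e in minibatch])
    rewards = np.array([e.reward for e in minibatch])
    actions = np.array([e.at for e in minibatch])
    dones = np.array([e.done for e in minibatch], dtype=bool)

    #current Q(s, .) for every transition, shape (B, 2)
    targets = model.predict(states)
    #max_a' Q(s', a') for every transition, shape (B,)
    next_q = np.max(model.predict(next_states), axis=1)
    #terminal transitions keep y = r; the rest get r + gamma * max_a' Q(s', a')
    targets[np.arange(len(minibatch)), actions] = rewards + gamma * next_q * (~dones)
    return states, targets

The body of the if self.mem.full(): block would then reduce to x, y = build_targets(model, minibatch, self.gamma) followed by model.fit(x, y, epochs=self.epochs).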

Here is the main Python file where I declare the Keras model that gets trained:


import numpy as np
import gym
import DQNAgent

import tensorflow as tf

tf.logging.set_verbosity(tf.logging.ERROR)

env = gym.make('CartPole-v0')
agent = DQNAgent.DQNAgent()

model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(24, activation=tf.keras.activations.sigmoid, input_dim=4))
model.add(tf.keras.layers.Dense(24, activation=tf.keras.activations.sigmoid))
model.add(tf.keras.layers.Dense(2,activation=tf.keras.activations.linear))

model.compile(optimizer=tf.train.AdamOptimizer(0.01),
              loss=tf.keras.losses.mean_squared_error)

agent.learn(1000,env,model)

#evaluate if model has been trained correctly

for episode in range(0,1000):
    next_state = env.reset()


    done = False
    while done == False:
        env.render()

        next_state = np.reshape(next_state, [1, 4])
        at = np.argmax(model.predict(next_state))

        next_state, reward, done, info = env.step(at)
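
For what it's worth, here is a small sketch of the same greedy rollout that also tracks the episode return, which makes it easier to tell how long the pole actually stays up (the episode count and the print format are just illustrative):

#sketch: same greedy rollout as above, but printing the total reward per episode
for episode in range(0, 100):
    state = env.reset()
    done = False
    score = 0
    while not done:
        env.render()
        state = np.reshape(state, [1, 4])
        at = np.argmax(model.predict(state))
        state, reward, done, info = env.step(at)
        score += reward
    print("episode", episode, "score", score)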

As you can probably guess, I am quite new to Tensorflow/Keras.

EDIT 1: I had forgotten to actually assign state = next_state. That helps the loss settle down, but it still does not solve the problem...
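
For completeness: the epsilon decay I mention at the top is still commented out in learn(), so eps stays fixed at 0.1 for the whole run. A minimal sketch of a standard multiplicative per-episode decay schedule is below; the 1.0 / 0.01 / 0.995 values are illustrative assumptions, not taken from my code or the linked example.

#sketch: per-episode multiplicative epsilon decay (all values here are illustrative)
eps = 1.0          #start almost fully exploratory
eps_min = 0.01     #never stop exploring completely
eps_decay = 0.995  #shrink epsilon a little after every episode

num_episodes = 1000
for episode in range(0, num_episodes):
    #... run one episode and the replay updates as in learn() above ...
    if eps > eps_min:
        eps = eps * eps_decay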

0 Answers:

No answers yet.