How to minimize RAM usage when training an Atari Deep Q-learning model

Date: 2019-10-07 13:39:32

Tags: python tensorflow machine-learning reinforcement-learning

I am trying to train a DQN for Atari on Google Colab (12.73 GB of RAM). Partway through training, the run stops with the message "Your session crashed after using all available RAM."
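For context, here is a rough back-of-the-envelope estimate of what the replay buffer alone can hold at full capacity. It assumes each stored frame is a single 105x80x1 array kept as float32; the stack_state helper is not shown in the post, so the dtype is an assumption.

import numpy as np

# Each transition in the deque keeps two full frames: state and next_state.
frame_bytes = 105 * 80 * 1 * np.dtype(np.float32).itemsize   # ~33.6 KB per frame
per_transition = 2 * frame_bytes
buffer_bytes = 50_000 * per_transition                        # deque(maxlen=50000)

print(f"~{buffer_bytes / 1024**3:.2f} GiB of frame data at full capacity")
# Roughly 3.1 GiB before Python object overhead, TensorFlow itself, and the
# per-step predict() calls are counted.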

Here is the agent class:

import random
from collections import deque

import numpy as np
import tqdm
from tensorflow.keras.layers import Input, Conv2D, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import Huber


class my_agent:
    def __init__(self, env):
        self.env = env
        self.state_input = Input((105, 80, 1), name='state_input')
        self.gamma = 0.6
        self.epsilon = 0.1
        self.epsilon_min = 0.01      # not set in the original snippet; placeholder value
        self.epsilon_decay = 0.00001
        self.memory = deque(maxlen=50000)   # replay buffer of full transitions
        self.q_net = self.main_model()
        self.target_net = self.main_model()
        self.alighn_target_model()

    def main_model(self):
        # Two-conv Q-network: Conv2D(filters, kernel_size, strides).
        x = Conv2D(16, 8, (4, 4), activation='relu')(self.state_input)
        x = Conv2D(32, 4, (2, 2), activation='relu')(x)
        x = Flatten()(x)
        x = Dense(256)(x)
        y = Dense(self.env.action_space.n)(x)   # was the global `env` in the original
        model = Model(inputs=[self.state_input], outputs=y)
        optimizer = Adam(learning_rate=0.01)
        model.compile(optimizer, loss=Huber())
        return model

    def store(self, state, action, reward, next_state, terminated):
        # Every transition keeps two full frames in RAM.
        self.memory.append((state, action, reward, next_state, terminated))

    def _update_epsilon(self):
        self.epsilon -= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)

    def alighn_target_model(self):
        # Copy the online network's weights into the target network.
        self.target_net.set_weights(self.q_net.get_weights())

    def act(self, state):
        # Epsilon-greedy action selection.
        if np.random.rand() <= self.epsilon:
            return self.env.action_space.sample()
        q_values = self.q_net.predict(state)
        a = np.argmax(q_values[0])
        return np.array([a])

    def retrain(self, batch_size):
        # Sample a minibatch and do one gradient step per transition.
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, terminated in minibatch:
            target = self.q_net.predict(state)
            if terminated:
                target[0][action] = reward
            else:
                t = self.target_net.predict(next_state)
                target[0][action] = reward + self.gamma * np.amax(t)
            self.q_net.fit(state, target, epochs=1, verbose=0)
Here is the training loop:

# num_of_episodes, timesteps_per_episode, batch_size, stacked_frames,
# stack_state, env and agent are defined earlier in the notebook.
for e in tqdm.tqdm(range(0, num_of_episodes)):

    state = env.reset()
    state, stacked_frames = stack_state(stacked_frames, state, True)

    total_reward = 0
    terminated = False

    for timestep in range(timesteps_per_episode):
        env.render()

        action = agent.act(state)
        print(action)

        next_state, reward, terminated, info = env.step(action)
        next_state, stacked_frames = stack_state(stacked_frames, next_state, False)

        agent.store(state, action, reward, next_state, terminated)

        state = next_state
        total_reward += reward

        if terminated:
            # The original printed an undefined `rewards`; accumulate the episode reward instead.
            print("Total reward is {}".format(total_reward))
            agent.alighn_target_model()
            break

        if len(agent.memory) > batch_size:
            agent.retrain(batch_size)
The total number of episodes is 1000 and there are 1000 steps per episode. Training stops within the very first episode.

Full notebook: https://github.com/abhisheksuran/Atari_DQN/blob/master/Atari_DQN_image.ipynb
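For scale, one common way to cut a buffer like this down (a sketch assuming the preprocessed frames are grayscale values in the 0-255 range, not code from the linked notebook) is to store transitions as uint8 and cast back to float32 only when building a batch for the network:

import random
from collections import deque

import numpy as np


class CompactReplayBuffer:
    """Replay buffer that stores frames as uint8 (1 byte/pixel) instead of float32."""

    def __init__(self, maxlen=50_000):
        self.memory = deque(maxlen=maxlen)

    def store(self, state, action, reward, next_state, terminated):
        # A 105x80 uint8 frame is ~8.4 KB instead of ~33.6 KB as float32.
        self.memory.append((np.asarray(state, dtype=np.uint8), action, reward,
                            np.asarray(next_state, dtype=np.uint8), terminated))

    def sample(self, batch_size):
        batch = random.sample(self.memory, batch_size)
        # Cast and rescale only the sampled minibatch.
        states = np.stack([b[0] for b in batch]).astype(np.float32) / 255.0
        actions = np.array([b[1] for b in batch])
        rewards = np.array([b[2] for b in batch], dtype=np.float32)
        next_states = np.stack([b[3] for b in batch]).astype(np.float32) / 255.0
        dones = np.array([b[4] for b in batch], dtype=bool)
        return states, actions, rewards, next_states, dones

With uint8 storage, the frame data for the same 50,000 transitions drops to roughly 0.8 GiB, about a quarter of the float32 figure.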

0 Answers:

No answers yet.