I wrote a DQN script to play BreakoutDeterministic and ran it on my school's GPU server. However, the script appears to consume 97% of the total RAM (more than 100 GB)!
I would like to know which part of the script demands such a high RAM usage. I ran memory-profiler for 3 episodes, and on my laptop the memory requirement seems to grow linearly with every time step.
I wrote the script in PyCharm with Python 3.6. My laptop has 12 GB of RAM and no GPU, while the school server runs Ubuntu with a P100 GPU.
import gym
import numpy as np
import random
from collections import deque
from keras.layers import Dense, Input, Lambda, convolutional, core
from keras.models import Model
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import os
import time as dt
plt.switch_backend('agg')
def preprocess(state):
    process_state = np.mean(state, axis=2).astype(np.uint8)
    process_state = process_state[::2, ::2]
    process_state_size = list(process_state.shape)
    process_state_size.append(1)
    process_state = np.reshape(process_state, process_state_size)
    return process_state
class DQNAgent:
    def __init__(self, env):
        self.env = env
        self.action_size = env.action_space.n
        self.state_size = self.select_state_size()
        self.memory = deque(maxlen=1000000)  # specify memory size
        self.gamma = 0.99
        self.eps = 1.0
        self.eps_min = 0.01
        self.decay = 0.95
        self.lr = 0.00025
        self.start_life = 5  # get from environment
        self.tau = 0.125  # special since 2 models to be trained
        self.model = self.create_cnnmodel()
        self.target_model = self.create_cnnmodel()

    def select_state_size(self):
        process_state = preprocess(self.env.reset())
        state_size = process_state.shape
        return state_size

    def create_cnnmodel(self):
        data_input = Input(shape=self.state_size, name='data_input', dtype='int32')
        normalized = Lambda(lambda x: x/255)(data_input)
        conv1 = convolutional.Convolution2D(32, 8, strides=(4, 4), activation='relu')(normalized)
        conv2 = convolutional.Convolution2D(64, 4, strides=(2, 2), activation='relu')(conv1)
        conv3 = convolutional.Convolution2D(64, 3, strides=(1, 1), activation='relu')(conv2)
        conv_flatten = core.Flatten()(conv3)  # flatten to feed cnn to fc
        h4 = Dense(512, activation='relu')(conv_flatten)
        prediction_output = Dense(self.action_size, name='prediction_output', activation='linear')(h4)
        model = Model(inputs=data_input, outputs=prediction_output)
        model.compile(optimizer=Adam(lr=self.lr),
                      loss='mean_squared_error')  # alternative loss: keras.losses.logcosh
        return model

    def remember(self, state, action, reward, new_state, done):  # store past experience as a pre-defined table
        self.memory.append([state, action, reward, new_state, done])

    def replay(self, batch_size):
        if batch_size > len(self.memory):
            return
        all_states = []
        all_targets = []
        samples = random.sample(self.memory, batch_size)
        for sample in samples:
            state, action, reward, new_state, done = sample
            target = self.target_model.predict(state)
            if done:
                target[0][action] = reward
            else:
                target[0][action] = reward + self.gamma*np.max(self.target_model.predict(new_state)[0])
            all_states.append(state)
            all_targets.append(target)
        history = self.model.fit(np.vstack(all_states), np.vstack(all_targets), epochs=1, verbose=0)
        return history

    def act(self, state):
        self.eps *= self.decay
        self.eps = max(self.eps_min, self.eps)
        if np.random.random() < self.eps:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state)[0])

    def train_target(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = (1-self.tau)*target_weights[i] + self.tau*weights[i]
        self.target_model.set_weights(target_weights)
def main(episodes):
    env = gym.make('BreakoutDeterministic-v4')
    agent = DQNAgent(env)
    time = env._max_episode_steps
    batch_size = 32
    save_model = 'y'
    rend_env = 'n'  # render flag, referenced inside the training loop
    filepath = os.getcwd()
    date = dt.strftime('%d%m%Y')
    clock = dt.strftime('%H.%M.%S')
    print('++ Training started on {} at {} ++'.format(date, clock))
    start_time = dt.time()
    tot_r = []
    tot_loss = []
    it_r = []
    it_loss = []
    tot_frames = 0
    for e in range(episodes):
        r = []
        loss = []
        state = env.reset()
        state = preprocess(state)
        state = state[None, :]
        current_life = agent.start_life
        for t in range(time):
            if rend_env == 'y':
                env.render()  # optionally render the environment
            action = agent.act(state)
            new_state, reward, terminal_life, life = env.step(action)
            new_state = preprocess(new_state)
            new_state = new_state[None, :]
            if life['ale.lives'] < current_life:
                reward = -1
                current_life = life['ale.lives']
            agent.remember(state, action, reward, new_state, terminal_life)
            hist = agent.replay(batch_size)
            agent.train_target()
            state = new_state
            r.append(reward)
            tot_frames += 1
            if hist is None:
                loss.append(0.0)
            else:
                loss.append(hist.history['loss'][0])
            if t % 20 == 0:
                print('Frame : {}, Cum Reward = {}, Avg Loss = {}, Curr Life: {}'.format(t,
                      np.sum(r),
                      round(np.mean(loss[-20:-1]), 3),
                      current_life))
                agent.model.save('{}/Mod_Fig/DQN_BO_model_{}.h5'.format(filepath, date))
                agent.model.save_weights('{}/Mod_Fig/DQN_BO_weights_{}.h5'.format(filepath, date))
            if current_life == 0 or terminal_life:
                print('Episode {} of {}, Cum Reward = {}, Avg Loss = {}'.format(e, episodes, np.sum(r), np.mean(loss)))
                break
        tot_r.append(np.sum(r))
        tot_loss.append(np.mean(loss))
        it_r.append(r)
        it_loss.append(loss)
    print('Training ended on {} at {}'.format(date, clock))
    run_time = dt.time() - start_time
    print('Total Training time: %d Hrs %d Mins %d s' % (run_time // 3600, (run_time % 3600) // 60,
                                                        (run_time % 3600) % 60))
    if save_model == 'y':
        agent.model.save('{}/Mod_Fig/DQN_BO_finalmodel_{}_{}.h5'.format(filepath, date, clock))
        agent.model.save_weights('{}/Mod_Fig/DQN_BO_finalweights_{}_{}.h5'.format(filepath, date, clock))
        agent.model.summary()
    return tot_r, tot_loss, it_r, it_loss, tot_frames
if __name__ == '__main__':
    episodes = 3
    total_reward, total_loss, rewards_iter, loss_iter, frames_epi = main(episodes=episodes)
Any comments and help on writing memory-efficient and fast deep RL code would be greatly appreciated! I want to train my DQN on Breakout for 5000 episodes, but the remote server only allows at most 48 hours of training. Thanks in advance!
Answer 0 (score: 0)
It sounds like you have a memory leak.
The line
agent.remember(state, action, reward, new_state, terminal_life)
is called 5000 * env._max_episode_steps times, and every state is a (210, 160, 3) array. The first thing to try is to reduce the size of self.memory = deque(maxlen=1000000) to verify that this is the sole cause.
If you really believe you need that much capacity, you should dump self.memory to disk and keep only a small subsample in memory.
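For illustration, a minimal sketch of that spill-to-disk idea. The class name, chunk file layout, and size thresholds below are made-up placeholders, not part of the original answer; loading spilled chunks back for sampling is left out:

import os
import pickle
import random
from collections import deque

class DiskBackedMemory:
    """Keep a small window of experience in RAM and spill older chunks to disk."""

    def __init__(self, mem_dir='replay_spill', in_ram=50000, spill_size=10000):
        self.in_ram = in_ram
        self.spill_size = spill_size
        self.mem_dir = mem_dir
        self.memory = deque(maxlen=in_ram)
        self.spill_files = []
        os.makedirs(mem_dir, exist_ok=True)

    def append(self, experience):
        if len(self.memory) == self.in_ram:
            # spill the oldest experiences to disk before they fall off the deque
            chunk = [self.memory.popleft() for _ in range(self.spill_size)]
            path = os.path.join(self.mem_dir, 'chunk_{}.pkl'.format(len(self.spill_files)))
            with open(path, 'wb') as f:
                pickle.dump(chunk, f)
            self.spill_files.append(path)
        self.memory.append(experience)

    def sample(self, batch_size):
        # sample only from the in-RAM portion in this sketch
        return random.sample(self.memory, batch_size)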
Additionally: subsampling from a deque is very slow. A deque is implemented as a linked list, so each subsample costs O(N * M). You should consider implementing your own ring buffer for self.memory.
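A minimal list-backed ring buffer sketch, assuming uniform random sampling is all that is needed (the class name and default capacity are placeholders): it overwrites the oldest entry once full and samples by index in O(batch_size).

import random

class RingBuffer:
    """Fixed-size replay buffer backed by a plain list; overwrites the oldest entry when full."""

    def __init__(self, capacity=100000):
        self.capacity = capacity
        self.data = []
        self.pos = 0  # index of the next slot to write

    def append(self, experience):
        if len(self.data) < self.capacity:
            self.data.append(experience)
        else:
            self.data[self.pos] = experience  # overwrite the oldest entry
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size):
        # list indexing is O(1), so drawing a batch is O(batch_size)
        return random.sample(self.data, batch_size)

    def __len__(self):
        return len(self.data)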
Alternatively: you could use a probabilistic buffer (I don't know the proper name for it), where every time you want to append to a full buffer you randomly delete one element and append the new one. This means that any (state, action, reward, ...) tuple ever encountered has a nonzero probability of being in the buffer, with recent tuples more likely to be there than old ones.
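A sketch of that probabilistic variant under the same list-backed assumptions: once the buffer is full, each new experience overwrites a uniformly random slot, so older tuples survive with steadily decreasing probability.

import random

class RandomReplacementBuffer:
    """Once full, each new experience overwrites a uniformly random slot."""

    def __init__(self, capacity=100000):
        self.capacity = capacity
        self.data = []

    def append(self, experience):
        if len(self.data) < self.capacity:
            self.data.append(experience)
        else:
            # every stored tuple has the same chance of being evicted right now,
            # so recent tuples tend to stick around longer than old ones
            self.data[random.randrange(self.capacity)] = experience

    def sample(self, batch_size):
        return random.sample(self.data, batch_size)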
Answer 1 (score: 0)
I had a similar problem with memory, and I still do to some extent. The main cause of the huge memory consumption is the states. Here is what I did to make it better:
Step 1: Resize the frames to 84 x 84 samples using openCV (some people downsample the image to 84 x 84 instead). This results in each state having a shape of (84, 84, 3).
Step 2: Convert these frames to grayscale (basically black and white). This changes the shape to (84, 84, 1).
Step 3: Store the states as dtype=np.uint8. They consume the least memory and are perfect for pixel intensity values in the 0-255 range.
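A sketch of steps 1-3 combined, assuming cv2 (opencv-python) is installed; the function name is a placeholder and the shapes in the comments follow the Breakout frame size from the question.

import cv2
import numpy as np

def preprocess_frame(frame):
    """Grayscale + resize to 84x84 + uint8: roughly 7 KB per state instead of ~100 KB."""
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)                       # (210, 160, 3) -> (210, 160)
    resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)   # -> (84, 84)
    return resized.astype(np.uint8)[:, :, None]                          # -> (84, 84, 1), uint8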
Additional information
I run my code on a free Google Colab notebook (Tesla K80 GPU, 13 GB RAM) and periodically save the replay buffer to my Drive.
For steps 1 and 2, consider using the OpenAI baselines Atari wrappers, since there is no need to reinvent the wheel.
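For reference, a hedged usage sketch assuming the openai/baselines package is installed; module paths and keyword arguments can differ between versions, so treat this as an illustration rather than a drop-in recipe.

from baselines.common.atari_wrappers import make_atari, wrap_deepmind

# make_atari expects a NoFrameskip environment id
env = make_atari('BreakoutNoFrameskip-v4')
# wrap_deepmind handles grayscale conversion, 84x84 resizing, reward clipping and episodic life
env = wrap_deepmind(env, frame_stack=True, scale=False)
obs = env.reset()  # stacked 84x84 frames, stored internally as uint8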
You can also check how much RAM your own program uses at every step, as I do, with this snippet:
import os
import psutil
def show_RAM_usage():
    py = psutil.Process(os.getpid())
    # resident set size of the current process, converted to GB
    print('RAM usage: {} GB'.format(py.memory_info()[0] / 2. ** 30))
This snippet was adapted from the original answer for use in my own program.