DQN TensorFlow code runs out of memory very quickly

Time: 2018-12-07 16:50:17

Tags: python tensorflow memory-leaks

I am trying to train a TurtleBot simulation with a DQN. The TurtleBot has to find a target in a maze. The task is fairly simple, and training does converge. My problem is that after a while the training becomes very slow: it is fast at the start, but after about 50 hours of running it crawls. I have looked into it; my CPU is not even at 50% load, but my memory is being eaten up, with around 98% of it in use. Somewhere in my code I am leaking memory, and I suspect it is in the initialization of the DQN agent. Could you point out what the problem is and how to fix it?

Thanks a lot.
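
For reference, a minimal per-episode logging sketch like the one below could make the growth visible (it is not part of my code; it assumes the psutil package is installed and that the default TensorFlow graph is the one being trained):

# Hypothetical diagnostics: log process memory and TF graph size once per episode.
import os
import psutil                      # extra dependency, assumed to be available
import tensorflow as tf

def log_memory_and_graph(episode):
    rss_mb = psutil.Process(os.getpid()).memory_info().rss / 1e6
    n_ops  = len(tf.get_default_graph().get_operations())
    # If n_ops keeps climbing, new graph nodes are being added every episode/step.
    print("episode %d: rss=%.1f MB, graph ops=%d" % (episode, rss_mb, n_ops))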

Here is the training code, based on a DQN with a prioritized replay buffer:

#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import gym
import gym_gazebo
import tensorflow as tf
import numpy as np
import time
import random
from random import *
import cv2
from gym import wrappers
from skimage import transform 
import datetime

import liveplot
from dqn_agent_withTarget import DQNAgent
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

#--------------------------------------------------------------------------------------------------------------------------------------
def render():
    render_skip       = 0 #Skip first X episodes.
    render_interval   = 50 #Show render Every Y episodes.
    render_episodes   = 10 #Show Z episodes every rendering.

    if (x%render_interval == 0) and (x != 0) and (x > render_skip):
        env.render()
    elif ((x-render_episodes)%render_interval == 0) and (x != 0) and (x > render_skip) and (render_episodes < x):
        env.render(close=True)

#--------------------------------------------------------------------------------------------------------------------------------------

if __name__ == '__main__':
    #------------------------------------------------------------------------
    env               = gym.make('GazeboCircuit2TurtlebotLidar-v0')
    outdir            = '/tmp/gazebo_gym_experiments'
    env               = gym.wrappers.Monitor(env, outdir, force=True)
    plotter           = liveplot.LivePlot(outdir)
    last_time_steps   = np.ndarray(0)
    start_time        = time.time()
    total_episodes    = 1000
    max_steps         = 200
    highest_reward    = 0
    gamma             = 0.95
    num_actions       = 3
    action_space      = [0,1,2]
    tf.reset_default_graph()                             # Reset training graph                                   
    myinit            = tf.global_variables_initializer()# Initialize training network 

    #tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.set_verbosity(tf.logging.ERROR)
    #------------------------------------------------------------------------
    agent             = DQNAgent(action_space,"GazeboCircuit2TurtlebotLidar-v0")

    agent.exploration = 1
    cv2.namedWindow("window", 1)
    x_val = np.random.rand(4096,256).astype(np.float32)
    agent.W_fc1.load(x_val, session=agent.sess)

    for e in range(total_episodes):
        # reset
        linecount = 0
        terminal= False
        win     = 0
        frame   = 0
        loss    = 0.0
        Q_max   = 0.0
        steps   = 0
        reward_t= 0.0
        env.reset()
        cumulated_rewards  = 0
        agent.exploration *= 0.9
        if agent.exploration<0.1:
            agent.exploration=0.1

        _, reward, terminal, info = env.step(0)

        linecount += 1
        print( "Time %s, %s" %(linecount,datetime.datetime.now()))
        img_tmp     = cv2.resize(info, (32, 32), interpolation=cv2.INTER_NEAREST)
        linecount += 1
        print( "Time %s, %s" %(linecount,datetime.datetime.now()))
        state_t_1   = tf.image.convert_image_dtype(img_tmp, dtype=tf.float32)
        state_t_1   = tf.reshape(state_t_1,(-1,32,32,4))


        while (not terminal):
            steps  += 1
            state_t = state_t_1

            # execute action in environment
            action_t = agent.select_action(state_t, agent.exploration)
            _, reward_t, terminal, info = env.step(action_t)
            #print("step: ", steps, "action: ",action_t ,"reward: ", reward_t)
            print(action_t , end="")
            img_tmp     = cv2.resize(info, (32, 32), interpolation=cv2.INTER_NEAREST)
            state_t_1   = tf.image.convert_image_dtype(img_tmp, dtype=tf.float32)
            state_t_1   = tf.reshape(state_t_1,(-1,32,32,4))
            # store experience
            agent.store_experience(state_t, action_t, reward_t, state_t_1, terminal)
            # experience replay
            agent.experience_replay()
            #print(agent.sess.run(agent.W_fc1))

            # for log
            frame += 1
            loss  += agent.current_loss
            Q_max += np.max(agent.Q_values(state_t))
            cumulated_rewards += reward_t



        print(" ")
        print("episodes:",e," steps:",steps," loss:",'{0:.2f}'.format(loss/(steps+1)), " terminal:",terminal, " exploration_factor:",agent.exploration , " reward:", '{0:.2f}'.format(cumulated_rewards))
        plotter.plot(env)
        #print("EPOCH: {:03d}/{:03d} | WIN: {:03d} | LOSS: {:.4f} | Q_MAX: {:.4f}".format(
        #    e, total_episodes - 1, cumulated_rewards, loss / frame, Q_max / frame))
        env._flush(force=True)
        # save model
        weights=agent.sess.run(agent.W_fc1)
        print(weights)
        weights_tmp     = cv2.resize(weights, (256,256), interpolation=cv2.INTER_NEAREST)
        weights_image   = tf.image.convert_image_dtype(weights_tmp, dtype=tf.float32)
        cv2.imshow("window",agent.sess.run(weights_image))
        cv2.waitKey(1)

    # save model
    agent.save_model()    

    env.close()
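
Side note: inside the while loop above, new tf.image.convert_image_dtype and tf.reshape graph nodes are created on every step. A NumPy-only preprocessing sketch (purely illustrative, assuming info is a uint8 image with 4 channels) would keep the graph size constant:

# Sketch: NumPy-only preprocessing, no new TF graph nodes per step.
import cv2
import numpy as np

def preprocess(info):
    img_tmp = cv2.resize(info, (32, 32), interpolation=cv2.INTER_NEAREST)
    state   = img_tmp.astype(np.float32) / 255.0   # same scaling convert_image_dtype applies to uint8 input
    return state.reshape(1, 32, 32, 4)             # plain np.ndarray instead of a tf tensor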

Here is the DQN agent code (I think the problem is in the initializer of the DQN agent):

from collections import deque
import os

import numpy as np
import tensorflow as tf


class DQNAgent:
"""
Multi Layer Perceptron with Experience Replay
"""

def __init__(self, enable_actions, environment_name):
    # parameters
    self.name = os.path.splitext(os.path.basename(__file__))[0]
    self.environment_name = environment_name
    self.enable_actions = enable_actions
    self.n_actions = len(self.enable_actions)
    self.minibatch_size = 64
    self.replay_memory_size = 1000
    self.learning_rate = 0.001
    self.discount_factor = 0.9
    self.exploration = 1.0
    self.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
    self.model_name = "{}.ckpt".format(self.environment_name)

    # replay memory
    self.D = deque(maxlen=self.replay_memory_size)

    # model
    self.init_model()

    # variables
    self.current_loss = 0.0

    def init_model(self):

        #policy##################################################################################

        # input layer (32 x 32 x 4)
        self.x = tf.placeholder(tf.float32, [None, 32, 32,4])

        # convolution layer
        self.W_cv1 = tf.Variable(tf.truncated_normal([5, 5, 4, 4], stddev=0.01))#4filters
        self.b_cv1 = tf.Variable(tf.zeros([4]))
        self.c_cv1 = tf.nn.conv2d(self.x, self.W_cv1, strides=[1, 1, 1, 1], padding='SAME')
        self.h_cv1 = tf.nn.relu(self.c_cv1 + self.b_cv1)

        # flatten (4096)
        self.x_flat = tf.reshape(self.h_cv1, [-1,4096])

        # fully connected layer [1,256]
        self.W_fc1 = tf.Variable(tf.truncated_normal([4096, 256], stddev=0.01))
        self.b_fc1 = tf.Variable(tf.zeros([256]))
        self.h_fc1 = tf.nn.relu(tf.matmul(self.x_flat, self.W_fc1) + self.b_fc1)

        # fully connected layer [1,32]
        self.W_fc2 = tf.Variable(tf.truncated_normal([256,32], stddev=0.01))
        self.b_fc2 = tf.Variable(tf.zeros([32]))
        self.h_fc2 = tf.nn.relu(tf.matmul(self.h_fc1, self.W_fc2) + self.b_fc2)

        # output layer (n_actions)
        self.W_out  = tf.Variable(tf.truncated_normal([32, self.n_actions], stddev=0.01))
        self.b_out  = tf.Variable(tf.zeros([self.n_actions]))
        self.y = tf.matmul(self.h_fc2, self.W_out) + self.b_out

        # loss function
        self.y_   = tf.placeholder(tf.float32, [None, self.n_actions])
        self.loss = tf.reduce_mean(tf.square(self.y_ - self.y))

        # train operation
        optimizer = tf.train.AdamOptimizer(self.learning_rate) #changed from RMS to Adam
        self.training = optimizer.minimize(self.loss)

        #target######################################################################################

        # input layer (32 x 32 x 4)
        self.x_t = tf.placeholder(tf.float32, [None, 32, 32,4])

        # convolution layer
        self.W_cv1_t = tf.Variable(tf.truncated_normal([5, 5, 4, 4], stddev=0.01))#4filters
        self.b_cv1_t = tf.Variable(tf.zeros([4]))
        self.c_cv1_t = tf.nn.conv2d(self.x, self.W_cv1, strides=[1, 1, 1, 1], padding='SAME')
        self.h_cv1_t = tf.nn.relu(self.c_cv1 + self.b_cv1)

        # flatten (4096)
        self.x_flat_t = tf.reshape(self.h_cv1, [-1,4096])

        # fully connected layer [1,256]
        self.W_fc1_t = tf.Variable(tf.truncated_normal([4096, 256], stddev=0.01))
        self.b_fc1_t = tf.Variable(tf.zeros([256]))
        self.h_fc1_t = tf.nn.relu(tf.matmul(self.x_flat, self.W_fc1) + self.b_fc1)

        # fully connected layer [1,32]
        self.W_fc2_t = tf.Variable(tf.truncated_normal([256,32], stddev=0.01))
        self.b_fc2_t = tf.Variable(tf.zeros([32]))
        self.h_fc2_t = tf.nn.relu(tf.matmul(self.h_fc1, self.W_fc2) + self.b_fc2)

        # output layer (n_actions)
        self.W_out_t  = tf.Variable(tf.truncated_normal([32, self.n_actions], stddev=0.01))
        self.b_out_t  = tf.Variable(tf.zeros([self.n_actions]))
        self.y_t = tf.matmul(self.h_fc2, self.W_out) + self.b_out

        # loss function
        self.y__t   = tf.placeholder(tf.float32, [None, self.n_actions])
        self.loss_t = tf.reduce_mean(tf.square(self.y_ - self.y))

        # train operation
        optimizer_t = tf.train.AdamOptimizer(self.learning_rate) #changed from RMS to Adam
        self.training_t = optimizer.minimize(self.loss)

        #general################################################################################

        # saver
        self.saver = tf.train.Saver()

        # session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def Q_values(self, state):
        # Q(state, action) of all actions
        #print("QQQ VALUES______________________________________________",self.sess.run(state))
        x_tmp             = self.sess.run(state)
        return self.sess.run(self.y, feed_dict={self.x: x_tmp})#[0]

    def select_action(self, state, epsilon):
        if np.random.rand() <= epsilon:
            # random
            return np.random.choice(self.enable_actions)
        else:
            # max_action Q(state, action)
            #print("G" , end="")
            return self.enable_actions[np.argmax(self.Q_values(state))]

    def store_experience(self, state, action, reward, state_1, terminal):
        self.D.append((state, action, reward, state_1, terminal))

    def experience_replay(self):
        state_minibatch = []
        y_minibatch = []

        # sample random minibatch
        minibatch_size = min(len(self.D), self.minibatch_size)
        minibatch_indexes = np.random.randint(0, len(self.D), minibatch_size)

        for j in minibatch_indexes:
            state_j, action_j, reward_j, state_j_1, terminal = self.D[j]
            action_j_index = self.enable_actions.index(action_j)

            y_j = self.Q_values(state_j)[0]

            if terminal:
                y_j[action_j_index] = reward_j
            else:
                # reward_j + gamma * max_action' Q(state', action')
                y_j[action_j_index] = reward_j + self.discount_factor * np.max(self.Q_values(state_j_1))  # NOQA

            x_tmp = self.sess.run(state_j)
            y_j=np.reshape(y_j,(1,3))
            state_minibatch.append(x_tmp[0])
            y_minibatch.append(y_j[0])

        # training
        self.sess.run(self.training, feed_dict={self.x: state_minibatch, self.y_: y_minibatch})

        # for log
        self.current_loss = self.sess.run(self.loss, feed_dict={self.x: state_minibatch, self.y_: y_minibatch})

    def load_model(self, model_path=None):
        if model_path:
            # load from model_path
            self.saver.restore(self.sess, model_path)
        else:
            # load from checkpoint
            checkpoint = tf.train.get_checkpoint_state(self.model_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                self.saver.restore(self.sess, checkpoint.model_checkpoint_path)

    def save_model(self):
        self.saver.save(self.sess, os.path.join(self.model_dir, self.model_name))
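
If the training loop fed the agent NumPy arrays, as in the preprocessing sketch above, the replay memory would hold plain arrays instead of TensorFlow tensors and the extra self.sess.run(state) evaluations would disappear. A hypothetical variant (not my current code), assuming states are np.float32 arrays of shape (1, 32, 32, 4):

    # Sketch of Q_values if states are already NumPy arrays:
    def Q_values(self, state):
        return self.sess.run(self.y, feed_dict={self.x: state})

    # ...and in experience_replay the line
    #     x_tmp = self.sess.run(state_j)
    # would simply become
    #     x_tmp = state_j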

Thanks for your help.
