I am trying to train a TurtleBot simulation with DQN. The TurtleBot is supposed to find a target in a maze. The task is fairly simple and it does converge. My problem is that after a few runs the training becomes very slow: it is fast right at the start, but after about 50 hours of running it crawls. I have looked into it and my CPU is not even at 50% utilization, while my memory is nearly exhausted, with roughly 98% in use. I am leaking memory somewhere in my code, and I think it is in the initialization of the DQN agent. Could you guide me on what the problem is and how to fix it?
Thanks a lot.
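For reference, this is roughly how I could log the process memory once per episode to see how quickly it grows (a minimal sketch, assuming psutil is installed; the log_memory helper is my own addition and is not part of the training script below):

import os
import psutil

# Handle to the current training process (my own diagnostic, not in the script below).
_process = psutil.Process(os.getpid())

def log_memory(episode):
    # Resident set size of this process in MB; if this climbs steadily
    # every episode, something is accumulating between episodes.
    rss_mb = _process.memory_info().rss / (1024.0 * 1024.0)
    print("episode %d: %.1f MB resident" % (episode, rss_mb))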
Here is the training code, based on DQN with a prioritized replay buffer:
#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import gym_gazebo
import tensorflow as tf
import numpy as np
import time
import random
from random import *
import cv2
from gym import wrappers
from skimage import transform
import datetime
import liveplot
from dqn_agent_withTarget import DQNAgent
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
#--------------------------------------------------------------------------------------------------------------------------------------
def render():
    render_skip = 0       # Skip first X episodes.
    render_interval = 50  # Show render every Y episodes.
    render_episodes = 10  # Show Z episodes every rendering.
    if (x % render_interval == 0) and (x != 0) and (x > render_skip):
        env.render()
    elif ((x - render_episodes) % render_interval == 0) and (x != 0) and (x > render_skip) and (render_episodes < x):
        env.render(close=True)
#--------------------------------------------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    #------------------------------------------------------------------------
    env = gym.make('GazeboCircuit2TurtlebotLidar-v0')
    outdir = '/tmp/gazebo_gym_experiments'
    env = gym.wrappers.Monitor(env, outdir, force=True)
    plotter = liveplot.LivePlot(outdir)
    last_time_steps = np.ndarray(0)
    start_time = time.time()
    total_episodes = 1000
    max_steps = 200
    highest_reward = 0
    gamma = 0.95
    num_actions = 3
    action_space = [0, 1, 2]
    tf.reset_default_graph()                    # Reset training graph
    myinit = tf.global_variables_initializer()  # Initialize training network
    #tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.set_verbosity(tf.logging.ERROR)
    #------------------------------------------------------------------------
    agent = DQNAgent(action_space, "GazeboCircuit2TurtlebotLidar-v0")
    agent.exploration = 1
    cv2.namedWindow("window", 1)
    x_val = np.random.rand(4096, 256).astype(np.float32)
    agent.W_fc1.load(x_val, session=agent.sess)
    for e in range(total_episodes):
        # reset
        linecount = 0
        terminal = False
        win = 0
        frame = 0
        loss = 0.0
        Q_max = 0.0
        steps = 0
        reward_t = 0.0
        env.reset()
        cumulated_rewards = 0
        agent.exploration *= 0.9
        if agent.exploration < 0.1:
            agent.exploration = 0.1
        _, reward, terminal, info = env.step(0)
        linecount += 1
        print("Time %s, %s" % (linecount, datetime.datetime.now()))
        img_tmp = cv2.resize(info, (32, 32), interpolation=cv2.INTER_NEAREST)
        linecount += 1
        print("Time %s, %s" % (linecount, datetime.datetime.now()))
        state_t_1 = tf.image.convert_image_dtype(img_tmp, dtype=tf.float32)
        state_t_1 = tf.reshape(state_t_1, (-1, 32, 32, 4))
        while (not terminal):
            steps += 1
            state_t = state_t_1
            # execute action in environment
            action_t = agent.select_action(state_t, agent.exploration)
            _, reward_t, terminal, info = env.step(action_t)
            #print("step: ", steps, "action: ", action_t, "reward: ", reward_t)
            print(action_t, end="")
            img_tmp = cv2.resize(info, (32, 32), interpolation=cv2.INTER_NEAREST)
            state_t_1 = tf.image.convert_image_dtype(img_tmp, dtype=tf.float32)
            state_t_1 = tf.reshape(state_t_1, (-1, 32, 32, 4))
            # store experience
            agent.store_experience(state_t, action_t, reward_t, state_t_1, terminal)
            # experience replay
            agent.experience_replay()
            #print(agent.sess.run(agent.W_fc1))
            # for log
            frame += 1
            loss += agent.current_loss
            Q_max += np.max(agent.Q_values(state_t))
            cumulated_rewards += reward_t
        print(" ")
        print("episodes:", e, " steps:", steps, " loss:", '{0:.2f}'.format(loss / (steps + 1)),
              " terminal:", terminal, " exploration_factor:", agent.exploration,
              " reward:", '{0:.2f}'.format(cumulated_rewards))
        plotter.plot(env)
        #print("EPOCH: {:03d}/{:03d} | WIN: {:03d} | LOSS: {:.4f} | Q_MAX: {:.4f}".format(
        #    e, total_episodes - 1, cumulated_rewards, loss / frame, Q_max / frame))
        env._flush(force=True)
        # visualize the first fully connected layer weights
        weights = agent.sess.run(agent.W_fc1)
        print(weights)
        weights_tmp = cv2.resize(weights, (256, 256), interpolation=cv2.INTER_NEAREST)
        weights_image = tf.image.convert_image_dtype(weights_tmp, dtype=tf.float32)
        cv2.imshow("window", agent.sess.run(weights_image))
        cv2.waitKey(1)
        # save model
        agent.save_model()
    env.close()
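One check I think would help narrow this down is whether the TensorFlow graph itself keeps growing from episode to episode (a minimal sketch; the count_graph_ops helper is my own addition and is not called anywhere in the script above):

import tensorflow as tf

def count_graph_ops(tag=""):
    # In TF1, every tf.* call that builds a tensor adds nodes to the default
    # graph; if this count increases every episode, the graph is growing
    # without bound and memory will grow with it.
    n_ops = len(tf.get_default_graph().get_operations())
    print("%s graph ops: %d" % (tag, n_ops))

Calling this once at the end of each episode would show whether the slowdown correlates with graph size.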
Here is the DQN agent code (I think the problem is in the initializer of the DQN agent):
from collections import deque
import os
import numpy as np
import tensorflow as tf
class DQNAgent:
    """
    Multi Layer Perceptron with Experience Replay
    """

    def __init__(self, enable_actions, environment_name):
        # parameters
        self.name = os.path.splitext(os.path.basename(__file__))[0]
        self.environment_name = environment_name
        self.enable_actions = enable_actions
        self.n_actions = len(self.enable_actions)
        self.minibatch_size = 64
        self.replay_memory_size = 1000
        self.learning_rate = 0.001
        self.discount_factor = 0.9
        self.exploration = 1.0
        self.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
        self.model_name = "{}.ckpt".format(self.environment_name)
        # replay memory
        self.D = deque(maxlen=self.replay_memory_size)
        # model
        self.init_model()
        # variables
        self.current_loss = 0.0

    def init_model(self):
        # policy network #####################################################################
        # input layer (32 x 32 x 4)
        self.x = tf.placeholder(tf.float32, [None, 32, 32, 4])
        # convolution layer (4 filters)
        self.W_cv1 = tf.Variable(tf.truncated_normal([5, 5, 4, 4], stddev=0.01))
        self.b_cv1 = tf.Variable(tf.zeros([4]))
        self.c_cv1 = tf.nn.conv2d(self.x, self.W_cv1, strides=[1, 1, 1, 1], padding='SAME')
        self.h_cv1 = tf.nn.relu(self.c_cv1 + self.b_cv1)
        # flatten (4096)
        self.x_flat = tf.reshape(self.h_cv1, [-1, 4096])
        # fully connected layer [1, 256]
        self.W_fc1 = tf.Variable(tf.truncated_normal([4096, 256], stddev=0.01))
        self.b_fc1 = tf.Variable(tf.zeros([256]))
        self.h_fc1 = tf.nn.relu(tf.matmul(self.x_flat, self.W_fc1) + self.b_fc1)
        # fully connected layer [1, 32]
        self.W_fc2 = tf.Variable(tf.truncated_normal([256, 32], stddev=0.01))
        self.b_fc2 = tf.Variable(tf.zeros([32]))
        self.h_fc2 = tf.nn.relu(tf.matmul(self.h_fc1, self.W_fc2) + self.b_fc2)
        # output layer (n_actions)
        self.W_out = tf.Variable(tf.truncated_normal([32, self.n_actions], stddev=0.01))
        self.b_out = tf.Variable(tf.zeros([self.n_actions]))
        self.y = tf.matmul(self.h_fc2, self.W_out) + self.b_out
        # loss function
        self.y_ = tf.placeholder(tf.float32, [None, self.n_actions])
        self.loss = tf.reduce_mean(tf.square(self.y_ - self.y))
        # train operation
        optimizer = tf.train.AdamOptimizer(self.learning_rate)  # changed from RMS to Adam
        self.training = optimizer.minimize(self.loss)

        # target network #####################################################################
        # input layer (32 x 32 x 4)
        self.x_t = tf.placeholder(tf.float32, [None, 32, 32, 4])
        # convolution layer (4 filters)
        self.W_cv1_t = tf.Variable(tf.truncated_normal([5, 5, 4, 4], stddev=0.01))
        self.b_cv1_t = tf.Variable(tf.zeros([4]))
        self.c_cv1_t = tf.nn.conv2d(self.x, self.W_cv1, strides=[1, 1, 1, 1], padding='SAME')
        self.h_cv1_t = tf.nn.relu(self.c_cv1 + self.b_cv1)
        # flatten (4096)
        self.x_flat_t = tf.reshape(self.h_cv1, [-1, 4096])
        # fully connected layer [1, 256]
        self.W_fc1_t = tf.Variable(tf.truncated_normal([4096, 256], stddev=0.01))
        self.b_fc1_t = tf.Variable(tf.zeros([256]))
        self.h_fc1_t = tf.nn.relu(tf.matmul(self.x_flat, self.W_fc1) + self.b_fc1)
        # fully connected layer [1, 32]
        self.W_fc2_t = tf.Variable(tf.truncated_normal([256, 32], stddev=0.01))
        self.b_fc2_t = tf.Variable(tf.zeros([32]))
        self.h_fc2_t = tf.nn.relu(tf.matmul(self.h_fc1, self.W_fc2) + self.b_fc2)
        # output layer (n_actions)
        self.W_out_t = tf.Variable(tf.truncated_normal([32, self.n_actions], stddev=0.01))
        self.b_out_t = tf.Variable(tf.zeros([self.n_actions]))
        self.y_t = tf.matmul(self.h_fc2, self.W_out) + self.b_out
        # loss function
        self.y__t = tf.placeholder(tf.float32, [None, self.n_actions])
        self.loss_t = tf.reduce_mean(tf.square(self.y_ - self.y))
        # train operation
        optimizer_t = tf.train.AdamOptimizer(self.learning_rate)  # changed from RMS to Adam
        self.training_t = optimizer.minimize(self.loss)

        # general ############################################################################
        # saver
        self.saver = tf.train.Saver()
        # session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def Q_values(self, state):
        # Q(state, action) of all actions
        #print("QQQ VALUES______________________________________________", self.sess.run(state))
        x_tmp = self.sess.run(state)
        return self.sess.run(self.y, feed_dict={self.x: x_tmp})  # [0]

    def select_action(self, state, epsilon):
        if np.random.rand() <= epsilon:
            # random
            return np.random.choice(self.enable_actions)
        else:
            # max_action Q(state, action)
            #print("G", end="")
            return self.enable_actions[np.argmax(self.Q_values(state))]

    def store_experience(self, state, action, reward, state_1, terminal):
        self.D.append((state, action, reward, state_1, terminal))

    def experience_replay(self):
        state_minibatch = []
        y_minibatch = []
        # sample random minibatch
        minibatch_size = min(len(self.D), self.minibatch_size)
        minibatch_indexes = np.random.randint(0, len(self.D), minibatch_size)
        for j in minibatch_indexes:
            state_j, action_j, reward_j, state_j_1, terminal = self.D[j]
            action_j_index = self.enable_actions.index(action_j)
            y_j = self.Q_values(state_j)[0]
            if terminal:
                y_j[action_j_index] = reward_j
            else:
                # reward_j + gamma * max_action' Q(state', action')
                y_j[action_j_index] = reward_j + self.discount_factor * np.max(self.Q_values(state_j_1))  # NOQA
            x_tmp = self.sess.run(state_j)
            y_j = np.reshape(y_j, (1, 3))
            state_minibatch.append(x_tmp[0])
            y_minibatch.append(y_j[0])
        # training
        self.sess.run(self.training, feed_dict={self.x: state_minibatch, self.y_: y_minibatch})
        # for log
        self.current_loss = self.sess.run(self.loss, feed_dict={self.x: state_minibatch, self.y_: y_minibatch})

    def load_model(self, model_path=None):
        if model_path:
            # load from model_path
            self.saver.restore(self.sess, model_path)
        else:
            # load from checkpoint
            checkpoint = tf.train.get_checkpoint_state(self.model_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                self.saver.restore(self.sess, checkpoint.model_checkpoint_path)

    def save_model(self):
        self.saver.save(self.sess, os.path.join(self.model_dir, self.model_name))
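Since I suspect the agent's initializer, one way I thought of to confirm whether anything is still being added to the graph after __init__ finishes is to freeze the graph right after constructing the agent (a minimal sketch of my own, not part of the scripts above; in TF1, Graph.finalize() makes the graph read-only, so the first attempt to create a new op afterwards raises a RuntimeError that points at the offending line):

import tensorflow as tf
from dqn_agent_withTarget import DQNAgent

agent = DQNAgent([0, 1, 2], "GazeboCircuit2TurtlebotLidar-v0")
# Freeze the default graph that init_model() built its ops into; any later
# op creation (wherever it happens) now raises a RuntimeError instead of
# silently growing the graph.
tf.get_default_graph().finalize()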
Thanks for your help.